update ChangeLog

Change-Id: Ib9504997d00c9f72cff30aa699fe1998b6be940d
enc/vp8enci.h: update version number
2025-07-15 13:29:54 +02:00 · 2014-10-14 20:43:43 +02:00 · 2014-10-14 20:40:02 +02:00 · 2014-10-14 12:02:17 +02:00 · 2014-10-14 12:02:17 +02:00 · 2014-10-14 12:02:17 +02:00
186 changed files with 58259 additions and 5992 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1,5 @@
+.gitattributes export-ignore
+.gitignore export-ignore
+.mailmap export-ignore
+*.pdf -text -diff
+*.ppm -text -diff
--- a/.gitignore
+++ b/.gitignore
@ -1,18 +1,32 @@
 *.l[ao]
-*.o
+*.[ao]
+*.pc
 .deps
 .libs
 /aclocal.m4
+/ar-lib
 /autom4te.cache
 /compile
 /config.*
 /configure
 /depcomp
+/dist
 /install-sh
 /libtool
 /ltmain.sh
 /missing
+/mkinstalldirs
 /stamp-h1
 Makefile
 Makefile.in
-examples/[cd]webp
+examples/[cdv]webp
+examples/gif2webp
+examples/webpmux
+src/webp/config.h*
+src/webp/stamp-h1
+/output
+/doc/output
+*.idb
+*.pdb
+/iosbuild
+/WebP.framework
--- a/.mailmap
+++ b/.mailmap
@ -0,0 +1,8 @@
+<johann.koenig@duck.com> <johannkoenig@google.com>
+Mikołaj Zalewski <mikolajz@google.com>
+Pascal Massimino <pascal.massimino@gmail.com>
+<pascal.massimino@gmail.com> <skal@google.com>
+Vikas Arora <vikasa@google.com>
+<vikasa@google.com> <vikasa@gmail.com>
+<vikasa@google.com> <vikaas.arora@gmail.com>
+<slobodan.prijic@imgtec.com> <Slobodan.Prijic@imgtec.com>
--- a/22
+++ b/22
@ -1,7 +1,25 @@
 Contributors:
+- Charles Munger (clm at google dot com)
+- Christian Duvivier (cduvivier at google dot com)
+- Djordje Pesut (djordje dot pesut at imgtec dot com)
+- James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
+- Johann (johann dot koenig at duck dot com)
+- Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
+- Jyrki Alakuijala (jyrki at google dot com)
+- levytamar82 (tamar dot levy at intel dot com)
+- Lou Quillio (louquillio at google dot com)
+- Mans Rullgard (mans at mansr dot com)
+- Martin Olsson (mnemo at minimum dot se)
 - Mikołaj Zalewski (mikolajz at google dot com)
+- Noel Chromium (noel at chromium dot org)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
- pierre.php@gmail.com
- Somnath Banerjee (somnath at google dot com)
+- Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
+- Pierre Joye (pierre dot php at gmail dot com)
+- Scott LaVarnway (slavarnway at google dot com)
+- Scott Talbot (s at chikachow dot org)
+- Slobodan Prijic (slobodan dot prijic at imgtec dot com)
+- Somnath Banerjee (somnath dot banerjee at gmail dot com)
+- Timothy Gu (timothygu99 at gmail dot com)
+- Urvang Joshi (urvang at google dot com)
 - Vikas Arora (vikasa at google dot com)
--- a/Android.mk
+++ b/Android.mk
@ -1,35 +1,109 @@
-LOCAL_PATH:= $(call my-dir)
+LOCAL_PATH := $(call my-dir)
+
+WEBP_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD -DWEBP_USE_THREAD
+
+ifeq ($(APP_OPTIM),release)
+  WEBP_CFLAGS += -finline-functions -ffast-math \
+                 -ffunction-sections -fdata-sections
+  ifeq ($(findstring clang,$(NDK_TOOLCHAIN_VERSION)),)
+    WEBP_CFLAGS += -frename-registers -s
+  endif
+endif

 include $(CLEAR_VARS)
+
+ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
+  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
+  # instructions to be generated for armv7a code. Instead target the neon code
+  # specifically.
+  NEON := c.neon
+else
+  NEON := c
+endif
+
 LOCAL_SRC_FILES := \
-	src/dec/bits.c \
-	src/dec/dsp.c \
-	src/dec/frame.c \
-	src/dec/idec.c \
-	src/dec/quant.c \
-	src/dec/tree.c \
-	src/dec/vp8.c \
-	src/dec/webp.c \
-	src/dec/yuv.c \
-	src/enc/analysis.c \
-	src/enc/bit_writer.c \
-	src/enc/config.c \
-	src/enc/dsp.c \
-	src/enc/filter.c \
-	src/enc/frame.c \
-	src/enc/iterator.c \
-	src/enc/picture.c \
-	src/enc/quant.c \
-	src/enc/syntax.c \
-	src/enc/tree.c \
-	src/enc/webpenc.c
-
-LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD \
-                -finline-functions -frename-registers -ffast-math \
-                -s -fomit-frame-pointer -Isrc/webp
+    src/dec/alpha.c \
+    src/dec/buffer.c \
+    src/dec/frame.c \
+    src/dec/idec.c \
+    src/dec/io.c \
+    src/dec/quant.c \
+    src/dec/tree.c \
+    src/dec/vp8.c \
+    src/dec/vp8l.c \
+    src/dec/webp.c \
+    src/dsp/alpha_processing.c \
+    src/dsp/alpha_processing_sse2.c \
+    src/dsp/cpu.c \
+    src/dsp/dec.c \
+    src/dsp/dec_clip_tables.c \
+    src/dsp/dec_mips32.c \
+    src/dsp/dec_neon.$(NEON) \
+    src/dsp/dec_sse2.c \
+    src/dsp/enc.c \
+    src/dsp/enc_avx2.c \
+    src/dsp/enc_mips32.c \
+    src/dsp/enc_neon.$(NEON) \
+    src/dsp/enc_sse2.c \
+    src/dsp/lossless.c \
+    src/dsp/lossless_mips32.c \
+    src/dsp/lossless_neon.$(NEON) \
+    src/dsp/lossless_sse2.c \
+    src/dsp/upsampling.c \
+    src/dsp/upsampling_neon.$(NEON) \
+    src/dsp/upsampling_sse2.c \
+    src/dsp/yuv.c \
+    src/dsp/yuv_mips32.c \
+    src/dsp/yuv_sse2.c \
+    src/enc/alpha.c \
+    src/enc/analysis.c \
+    src/enc/backward_references.c \
+    src/enc/config.c \
+    src/enc/cost.c \
+    src/enc/filter.c \
+    src/enc/frame.c \
+    src/enc/histogram.c \
+    src/enc/iterator.c \
+    src/enc/picture.c \
+    src/enc/picture_csp.c \
+    src/enc/picture_psnr.c \
+    src/enc/picture_rescale.c \
+    src/enc/picture_tools.c \
+    src/enc/quant.c \
+    src/enc/syntax.c \
+    src/enc/token.c \
+    src/enc/tree.c \
+    src/enc/vp8l.c \
+    src/enc/webpenc.c \
+    src/utils/bit_reader.c \
+    src/utils/bit_writer.c \
+    src/utils/color_cache.c \
+    src/utils/filters.c \
+    src/utils/huffman.c \
+    src/utils/huffman_encode.c \
+    src/utils/quant_levels.c \
+    src/utils/quant_levels_dec.c \
+    src/utils/random.c \
+    src/utils/rescaler.c \
+    src/utils/thread.c \
+    src/utils/utils.c \

+LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/src

-LOCAL_MODULE:= webp
+# prefer arm over thumb mode for performance gains
+LOCAL_ARM_MODE := arm

-include $(BUILD_STATIC_LIBRARY)
+LOCAL_STATIC_LIBRARIES := cpufeatures
+
+LOCAL_MODULE := webp
+
+ifeq ($(ENABLE_SHARED),1)
+  include $(BUILD_SHARED_LIBRARY)
+else
+  include $(BUILD_STATIC_LIBRARY)
+endif
+
+include $(LOCAL_PATH)/examples/Android.mk
+
+$(call import-module,android/cpufeatures)
--- a/2105
+++ b/2105
--- a/Makefile.vc
+++ b/Makefile.vc
@ -1,37 +1,41 @@
 #
 # Stem for static libs and DLLs
 #
-LIB_NAME       = libwebp_a
-LIB_NAME_DEBUG = libwebp_a_debug
-
-#
-# Stem for DLL import libs
-#
-IMPLIB_NAME       = libwebp
-IMPLIB_NAME_DEBUG = libwepb_debug
-
-!IFNDEF DEP_PATH
-DEPS_PATH   = ../../deps
-!ENDIF
+LIBWEBPDECODER_BASENAME = libwebpdecoder
+LIBWEBP_BASENAME = libwebp
+LIBWEBPMUX_BASENAME = libwebpmux
+LIBWEBPDEMUX_BASENAME = libwebpdemux

 !IFNDEF ARCH
-ARCH  = x86
+!IF ! [ cl 2>&1 | find "x86" > NUL ]
+ARCH = x86
+!ELSE IF ! [ cl 2>&1 | find "x64" > NUL ]
+ARCH = x64
+!ELSE
+!ERROR Unable to auto-detect toolchain architecture! \
+If cl.exe is in your PATH rerun nmake with ARCH=<arch>.
+!ENDIF
+!ENDIF
+
+!IF "$(ARCH)" == "x86"
+PLATFORM_LDFLAGS = /SAFESEH
 !ENDIF

 #############################################################
 ## Nothing more to do below this line!

-MT         = mt.exe
-CCNODBG    = cl.exe /nologo /O2 /DNDEBUG
-CCDEBUG    = cl.exe /nologo /Od /Gm /Zi /D_DEBUG /RTC1
-CFLAGS     = /Isrc /nologo /W3 /EHsc /DWIN32 /FD /c /GS /D_CRT_SECURE_NO_WARNINGS
-LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /SAFESEH /DYNAMICBASE
-CFLAGSLIB  = /DLIBWEBP_STATICLIB
-LNKDLL     = link.exe /DLL
-LNKLIB     = link.exe /lib
-LNKEXE     = link.exe
-LFLAGS     = /nologo /machine:$(ARCH)
-CFLAGS     = $(CFLAGS)
+NOLOGO     = /nologo
+CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
+CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
+CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c /GS
+CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
+CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
+LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
+LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
+LNKDLL     = link.exe /DLL $(NOLOGO)
+LNKEXE     = link.exe $(NOLOGO)
+LNKLIB     = lib.exe $(NOLOGO)
+MT         = mt.exe $(NOLOGO)

 CFGSET     = FALSE
 !IF "$(OBJDIR)" == ""
@ -40,6 +44,11 @@ OUTDIR = ..\obj\
 OUTDIR = $(OBJDIR)
 !ENDIF

+!IF "$(HAVE_AVX2)" == "1"
+CFLAGS = $(CFLAGS) /DWEBP_HAVE_AVX2
+AVX2_FLAGS = /arch:AVX2
+!ENDIF
+
 ##############################################################
 # Runtime library configuration
 !IF "$(RTLIBCFG)" == "static"
@ -54,20 +63,58 @@ DIROBJ = $(DIRBASE)\obj
 DIRLIB = $(DIRBASE)\lib
 DIRINC = $(DIRBASE)\include
 DIRBIN = $(DIRBASE)\bin
+LIBWEBP_PDBNAME = $(DIROBJ)\$(LIBWEBP_BASENAME).pdb
+OUTPUT_DIRS = $(DIRBIN) $(DIRINC) $(DIRLIB) \
+              $(DIROBJ)\dec \
+              $(DIROBJ)\demux \
+              $(DIROBJ)\dsp \
+              $(DIROBJ)\enc \
+              $(DIROBJ)\examples \
+              $(DIROBJ)\mux \
+              $(DIROBJ)\utils \

-# release-static
+# Target configuration
 !IF "$(CFG)" == "release-static"
-TARGET = $(LIB_NAME).lib
-LNK    = $(LNKLIB) /out:$(DIRLIB)\$(TARGET)
-CC     = $(CCNODBG) $(RTLIB) $(CFLAGSLIB)
-CFGSET = TRUE
+CC             = $(CCNODBG)
+STATICLIBBUILD = TRUE
+!ELSE IF "$(CFG)" == "debug-static"
+CC             = $(CCDEBUG)
+RTLIB          = $(RTLIBD)
+STATICLIBBUILD = TRUE
+LIBWEBPDECODER_BASENAME = $(LIBWEBPDECODER_BASENAME)_debug
+LIBWEBP_BASENAME = $(LIBWEBP_BASENAME)_debug
+LIBWEBPMUX_BASENAME = $(LIBWEBPMUX_BASENAME)_debug
+LIBWEBPDEMUX_BASENAME = $(LIBWEBPDEMUX_BASENAME)_debug
+!ELSE IF "$(CFG)" == "release-dynamic"
+CC        = $(CCNODBG)
+DLLBUILD  = TRUE
+!ELSE IF "$(CFG)" == "debug-dynamic"
+CC        = $(CCDEBUG)
+RTLIB     = $(RTLIBD)
+DLLBUILD  = TRUE
+LIBWEBPDECODER_BASENAME = $(LIBWEBPDECODER_BASENAME)_debug
+LIBWEBP_BASENAME = $(LIBWEBP_BASENAME)_debug
+LIBWEBPMUX_BASENAME = $(LIBWEBPMUX_BASENAME)_debug
+LIBWEBPDEMUX_BASENAME = $(LIBWEBPDEMUX_BASENAME)_debug
 !ENDIF

-# debug-static
-!IF "$(CFG)" == "debug-static"
-TARGET = $(LIB_NAME_DEBUG).lib
-LNK    = $(LNKLIB) /out:$(DIRLIB)\$(TARGET)
-CC     = $(CCDEBUG) $(RTLIBD) $(CFLAGSLIB)
+!IF "$(STATICLIBBUILD)" == "TRUE"
+CC     = $(CC) $(RTLIB)
+CFGSET = TRUE
+LIBWEBPDECODER = $(DIRLIB)\$(LIBWEBPDECODER_BASENAME).lib
+LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME).lib
+LIBWEBPMUX = $(DIRLIB)\$(LIBWEBPMUX_BASENAME).lib
+LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPDEMUX_BASENAME).lib
+!ELSE IF "$(DLLBUILD)" == "TRUE"
+DLLC   = webp_dll.c
+DLLINC = webp_dll.h
+DLL_OBJS = $(DIROBJ)\$(DLLC:.c=.obj)
+CC     = $(CC) /I$(DIROBJ) /FI$(DLLINC) $(RTLIB) /DWEBP_DLL
+LIBWEBPDECODER = $(DIRLIB)\$(LIBWEBPDECODER_BASENAME)_dll.lib
+LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME)_dll.lib
+LIBWEBPMUX = $(DIRLIB)\$(LIBWEBPMUX_BASENAME)_dll.lib
+LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPDEMUX_BASENAME)_dll.lib
+LIBWEBP_PDBNAME = $(DIROBJ)\$(LIBWEBP_BASENAME)_dll.pdb
 CFGSET = TRUE
 !ENDIF

@ -75,16 +122,25 @@ CFGSET = TRUE
 # Usage
 #
 !IF "$(CFGSET)" == "FALSE"
-!MESSAGE Usage: nmake /f makefile.vc9 [CFG=<config>] [OBJDIR=<path>] [RTLIBCFG=<rtlib>] [<target>]
+!MESSAGE Usage: nmake /f Makefile.vc [CFG=<config>]
+!MESSAGE .          [OBJDIR=<path>] [RTLIBCFG=<rtlib>] [<target>]
+!MESSAGE
 !MESSAGE where <config> is one of:
 !MESSAGE -  release-static                - release static library
 !MESSAGE -  debug-static                  - debug static library
-!MESSAGE -  (empty)                       - perform a clean
+!MESSAGE -  release-dynamic               - release dynamic link library (DLL)
+!MESSAGE -  debug-dynamic                 - debug dynamic link library (DLL)
 !MESSAGE
-!MESSAGE <rtlibcfg> controls the runtime library likage - can be 'static' or 'dynamic'.
-!MESSAGE <target> can be left blank in which case all is assumed
-!MESSAGE <path> is the path where you like to build (obj, bins, etc.)
-!MESSAGE   default to ..\obj\
+!MESSAGE <target> may be:
+!MESSAGE -  clean                         - perform a clean for CFG
+!MESSAGE -  experimental                  - build CFG with experimental
+!MESSAGE .                                  features enabled.
+!MESSAGE - (empty)                        - build libwebp-based targets for CFG
+!MESSAGE - all                            - build (de)mux-based targets for CFG
+!MESSAGE
+!MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
+!MESSAGE OBJDIR is the path where you like to build (obj, bins, etc.),
+!MESSAGE   defaults to ..\obj

 !IF "$(CFG)" != ""
 !MESSAGE
@ -93,99 +149,219 @@ CFGSET = TRUE
 !ENDIF

 #######################
-# Only the clean target can be used if a config was not provided.
+# Rules
 #
-!IF "$(CFGSET)" == "FALSE"
-!MESSAGE
-!MESSAGE No configuration provided - performing a clean.
-clean:
-	@-erase /s *.dll 2> NUL
-	@-erase /s *.exp 2> NUL
-	@-erase /s *.idb 2> NUL
-	@-erase /s *.lib 2> NUL
-	@-erase /s *.obj 2> NUL
-	@-erase /s *.pch 2> NUL
-	@-erase /s *.pdb 2> NUL
-	@-erase /s *.res 2> NUL
-!ELSE
+!IF "$(CFGSET)" == "TRUE"
 # A config was provided, so the library can be built.
 #

-X_OBJS= \
-	$(DIROBJ)\dec\bits.obj \
-	$(DIROBJ)\dec\dsp.obj \
-	$(DIROBJ)\dec\frame.obj \
-	$(DIROBJ)\dec\quant.obj \
-	$(DIROBJ)\dec\tree.obj \
-	$(DIROBJ)\dec\vp8.obj \
-	$(DIROBJ)\dec\webp.obj \
-	$(DIROBJ)\dec\yuv.obj \
-	$(DIROBJ)\dec\idec.obj \
-	$(DIROBJ)\enc\analysis.obj \
-	$(DIROBJ)\enc\bit_writer.obj \
-	$(DIROBJ)\enc\config.obj \
-	$(DIROBJ)\enc\cost.obj \
-	$(DIROBJ)\enc\dsp.obj \
-	$(DIROBJ)\enc\frame.obj \
-	$(DIROBJ)\enc\filter.obj \
-	$(DIROBJ)\enc\iterator.obj \
-	$(DIROBJ)\enc\picture.obj \
-	$(DIROBJ)\enc\quant.obj \
-	$(DIROBJ)\enc\syntax.obj \
-	$(DIROBJ)\enc\tree.obj \
-	$(DIROBJ)\enc\webpenc.obj \
-	$(RESOURCE)
+DEC_OBJS = \
+    $(DIROBJ)\dec\alpha.obj \
+    $(DIROBJ)\dec\buffer.obj \
+    $(DIROBJ)\dec\frame.obj \
+    $(DIROBJ)\dec\idec.obj \
+    $(DIROBJ)\dec\io.obj \
+    $(DIROBJ)\dec\quant.obj \
+    $(DIROBJ)\dec\tree.obj \
+    $(DIROBJ)\dec\vp8.obj \
+    $(DIROBJ)\dec\vp8l.obj \
+    $(DIROBJ)\dec\webp.obj \

-EXAMPLES_OBJS = \
-	$(DIROBJ)\examples\cwebp.obj \
-	$(DIROBJ)\examples\dwebp.obj
+DEMUX_OBJS = \
+    $(DIROBJ)\demux\demux.obj \

-all: $(DIRLIB)\$(TARGET) $(DIRBIN)\dwebp.exe $(DIRBIN)\cwebp.exe
+DSP_DEC_OBJS = \
+    $(DIROBJ)\dsp\alpha_processing.obj \
+    $(DIROBJ)\dsp\alpha_processing_sse2.obj \
+    $(DIROBJ)\dsp\cpu.obj \
+    $(DIROBJ)\dsp\dec.obj \
+    $(DIROBJ)\dsp\dec_clip_tables.obj \
+    $(DIROBJ)\dsp\dec_mips32.obj \
+    $(DIROBJ)\dsp\dec_neon.obj \
+    $(DIROBJ)\dsp\dec_sse2.obj \
+    $(DIROBJ)\dsp\lossless.obj \
+    $(DIROBJ)\dsp\lossless_mips32.obj \
+    $(DIROBJ)\dsp\lossless_neon.obj \
+    $(DIROBJ)\dsp\lossless_sse2.obj \
+    $(DIROBJ)\dsp\upsampling.obj \
+    $(DIROBJ)\dsp\upsampling_neon.obj \
+    $(DIROBJ)\dsp\upsampling_sse2.obj \
+    $(DIROBJ)\dsp\yuv.obj \
+    $(DIROBJ)\dsp\yuv_mips32.obj \
+    $(DIROBJ)\dsp\yuv_sse2.obj \

-$(DIRLIB)\$(TARGET): $(X_OBJS)
-	$(LNK) $(LFLAGS) $(X_OBJS)
-	-xcopy $(DIROBJ)\$(LIB_NAME).dll       $(DIRBIN) /y
-	-xcopy $(DIROBJ)\$(LIB_NAME).lib       $(DIRLIB) /y
-	-xcopy $(DIROBJ)\$(LIB_NAME_DEBUG).dll $(DIRBIN) /y
-	-xcopy $(DIROBJ)\$(LIB_NAME_DEBUG).lib $(DIRLIB) /y
-	-xcopy $(DIROBJ)\$(IMPLIB_NAME).lib    $(DIRLIB) /y
-	-xcopy $(DIROBJ)\$(IMPLIB_NAME_DEBUG).lib $(DIRLIB) /y
-	-xcopy $(DIROBJ)\*.exp                 $(DIRLIB) /y
-	-xcopy $(DIROBJ)\*.pdb                 $(DIRLIB) /y
+DSP_ENC_OBJS = \
+    $(DIROBJ)\dsp\enc.obj \
+    $(DIROBJ)\dsp\enc_avx2.obj \
+    $(DIROBJ)\dsp\enc_mips32.obj \
+    $(DIROBJ)\dsp\enc_neon.obj \
+    $(DIROBJ)\dsp\enc_sse2.obj \

-$(X_OBJS): $(DIROBJ)\enc $(DIROBJ)\dec $(DIRLIB) $(DIRINC) $(DIRBIN)
+EX_FORMAT_DEC_OBJS = \
+    $(DIROBJ)\examples\jpegdec.obj \
+    $(DIROBJ)\examples\metadata.obj \
+    $(DIROBJ)\examples\pngdec.obj \
+    $(DIROBJ)\examples\tiffdec.obj \
+    $(DIROBJ)\examples\webpdec.obj \
+    $(DIROBJ)\examples\wicdec.obj \

-$(EXAMPLES_OBJS): $(DIROBJ)\examples $(DIRLIB)\$(TARGET)
+EX_UTIL_OBJS = \
+    $(DIROBJ)\examples\example_util.obj \

-$(DIROBJ)\enc:
-	@if not exist "$(DIROBJ)\enc" mkdir $(DIROBJ)\enc
+ENC_OBJS = \
+    $(DIROBJ)\enc\alpha.obj \
+    $(DIROBJ)\enc\analysis.obj \
+    $(DIROBJ)\enc\backward_references.obj \
+    $(DIROBJ)\enc\config.obj \
+    $(DIROBJ)\enc\cost.obj \
+    $(DIROBJ)\enc\filter.obj \
+    $(DIROBJ)\enc\frame.obj \
+    $(DIROBJ)\enc\histogram.obj \
+    $(DIROBJ)\enc\iterator.obj \
+    $(DIROBJ)\enc\picture.obj \
+    $(DIROBJ)\enc\picture_csp.obj \
+    $(DIROBJ)\enc\picture_psnr.obj \
+    $(DIROBJ)\enc\picture_rescale.obj \
+    $(DIROBJ)\enc\picture_tools.obj \
+    $(DIROBJ)\enc\quant.obj \
+    $(DIROBJ)\enc\syntax.obj \
+    $(DIROBJ)\enc\token.obj \
+    $(DIROBJ)\enc\tree.obj \
+    $(DIROBJ)\enc\vp8l.obj \
+    $(DIROBJ)\enc\webpenc.obj \

-$(DIROBJ)\examples:
-	@if not exist "$(DIROBJ)\examples" mkdir $(DIROBJ)\examples
+MUX_OBJS = \
+    $(DIROBJ)\mux\muxedit.obj \
+    $(DIROBJ)\mux\muxinternal.obj \
+    $(DIROBJ)\mux\muxread.obj \

-$(DIROBJ)\dec:
-	@if not exist "$(DIROBJ)\dec" mkdir $(DIROBJ)\dec
+UTILS_DEC_OBJS = \
+    $(DIROBJ)\utils\bit_reader.obj \
+    $(DIROBJ)\utils\color_cache.obj \
+    $(DIROBJ)\utils\filters.obj \
+    $(DIROBJ)\utils\huffman.obj \
+    $(DIROBJ)\utils\quant_levels_dec.obj \
+    $(DIROBJ)\utils\rescaler.obj \
+    $(DIROBJ)\utils\random.obj \
+    $(DIROBJ)\utils\thread.obj \
+    $(DIROBJ)\utils\utils.obj \

-$(DIRLIB):
-	@if not exist "$(DIRLIB)" mkdir $(DIRLIB)
+UTILS_ENC_OBJS = \
+    $(DIROBJ)\utils\bit_writer.obj \
+    $(DIROBJ)\utils\huffman_encode.obj \
+    $(DIROBJ)\utils\quant_levels.obj \

-$(DIRINC):
-	@if not exist "$(DIRINC)" mkdir $(DIRINC)
+LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
+LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
+               $(UTILS_ENC_OBJS) $(DLL_OBJS)
+LIBWEBPMUX_OBJS = $(MUX_OBJS) $(LIBWEBPMUX_OBJS)
+LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS) $(LIBWEBPDEMUX_OBJS)

-$(DIRBIN):
-	@if not exist "$(DIRBIN)" mkdir $(DIRBIN)
+OUT_LIBS = $(LIBWEBPDECODER) $(LIBWEBP)
+OUT_EXAMPLES = $(DIRBIN)\cwebp.exe $(DIRBIN)\dwebp.exe
+EXTRA_EXAMPLES = $(DIRBIN)\vwebp.exe $(DIRBIN)\webpmux.exe
+
+ex: $(OUT_LIBS) $(OUT_EXAMPLES)
+all: ex $(EXTRA_EXAMPLES)
+$(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj $(EX_FORMAT_DEC_OBJS)
+$(DIRBIN)\dwebp.exe: $(DIROBJ)\examples\dwebp.obj
+$(DIRBIN)\vwebp.exe: $(DIROBJ)\examples\vwebp.obj
+$(DIRBIN)\vwebp.exe: $(EX_UTIL_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
+$(DIRBIN)\webpmux.exe: $(DIROBJ)\examples\webpmux.obj $(LIBWEBPMUX)
+$(DIRBIN)\webpmux.exe: $(EX_UTIL_OBJS) $(LIBWEBP)
+$(OUT_EXAMPLES): $(EX_UTIL_OBJS) $(LIBWEBP)
+$(EX_UTIL_OBJS) $(EX_FORMAT_DEC_OBJS): $(OUTPUT_DIRS)
+
+experimental:
+	$(MAKE) /f Makefile.vc \
+	    CFG=$(CFG) \
+	    CFLAGS="$(CFLAGS) /DWEBP_EXPERIMENTAL_FEATURES" /$(MAKEFLAGS)
+
+$(LIBWEBPDECODER): $(LIBWEBPDECODER_OBJS)
+$(LIBWEBP): $(LIBWEBP_OBJS)
+$(LIBWEBPMUX): $(LIBWEBPMUX_OBJS)
+$(LIBWEBPDEMUX): $(LIBWEBPDEMUX_OBJS)
+
+$(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): $(OUTPUT_DIRS)
+
+!IF "$(DLLBUILD)" == "TRUE"
+$(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): \
+    $(DIROBJ)\$(DLLINC) $(DIROBJ)\$(DLLC)
+
+{$(DIROBJ)}.c{$(DIROBJ)}.obj:
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$@  $<
+
+$(LIBWEBPMUX): $(LIBWEBP)
+$(LIBWEBPDEMUX): $(LIBWEBP)
+
+$(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
+	$(LNKDLL) /out:$(DIRBIN)\$(@B:_dll=.dll) /implib:$@ $(LFLAGS) $**
+	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y
+
+clean::
+	@-erase /s $(DIROBJ)\$(DLLC) $(DIROBJ)\$(DLLINC) 2> NUL
+!ELSE
+$(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
+	$(LNKLIB) /out:$@ $**
+	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y
+!ENDIF
+
+$(OUTPUT_DIRS):
+	@if not exist "$(@)" mkdir "$(@)"
+
+# generate a helper include to define WEBP_EXTERN suitable for the DLL build
+$(DIROBJ)\$(DLLINC):
+	@echo #ifndef WEBP_DLL_H_ > $@
+	@echo #define WEBP_DLL_H_ >> $@
+	@echo #define WEBP_EXTERN(type) __declspec(dllexport) type >> $@
+	@echo #endif  /* WEBP_DLL_H_ */ >> $@
+
+# expose a WebPFree() function for use in managed code
+$(DIROBJ)\$(DLLC): $(DIROBJ)\$(DLLINC)
+	@echo #include ^<stdlib.h^> > $@
+	@echo #include "webp_dll.h" >> $@
+	@echo // This function should be used in place of free() for memory >> $@
+	@echo // returned by the WebP API. >> $@
+	@echo WEBP_EXTERN(void) WebPFree(void* ptr) { >> $@
+	@echo   free(ptr); >> $@
+	@echo } >> $@

 .SUFFIXES: .c .obj .res .exe
-{examples}.c{$(DIROBJ)\examples}.obj:
-	$(CC) $(CFLAGS) /Fo"$@"  $<
-{src\dec}.c{$(DIROBJ)\dec}.obj:
-	$(CC) $(CFLAGS) /Fo"$@"  $<
-{src\enc}.c{$(DIROBJ)\enc}.obj:
-	$(CC) $(CFLAGS) /Fo"$@"  $<
+# File-specific flag builds. Note batch rules take precedence over wildcards,
+# so for now name each file individually.
+$(DIROBJ)\dsp\enc_avx2.obj: src\dsp\enc_avx2.c
+	$(CC) $(CFLAGS) $(AVX2_FLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dsp\ \
+	  src\dsp\$(@B).c
+# Batch rules
+{examples}.c{$(DIROBJ)\examples}.obj::
+	$(CC) $(CFLAGS) /Fd$(DIROBJ)\examples\ /Fo$(DIROBJ)\examples\ $<
+{src\dec}.c{$(DIROBJ)\dec}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dec\ $<
+{src\demux}.c{$(DIROBJ)\demux}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\demux\ $<
+{src\dsp}.c{$(DIROBJ)\dsp}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dsp\ $<
+{src\enc}.c{$(DIROBJ)\enc}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\enc\ $<
+{src\mux}.c{$(DIROBJ)\mux}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\mux\ $<
+{src\utils}.c{$(DIROBJ)\utils}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\utils\ $<

 {$(DIROBJ)\examples}.obj{$(DIRBIN)}.exe:
-	$(LNKEXE) $(LDFLAGS) /OUT:"$@" $<  ole32.lib windowscodecs.lib shlwapi.lib $(DIRLIB)\$(TARGET)
+	$(LNKEXE) $(LDFLAGS) /OUT:$@ $** \
+	    ole32.lib windowscodecs.lib shlwapi.lib
 	$(MT) -manifest $@.manifest -outputresource:$@;1
 	del $@.manifest

+clean::
+	@-erase /s $(DIROBJ)\*.dll 2> NUL
+	@-erase /s $(DIROBJ)\*.exp 2> NUL
+	@-erase /s $(DIROBJ)\*.idb 2> NUL
+	@-erase /s $(DIROBJ)\*.lib 2> NUL
+	@-erase /s $(DIROBJ)\*.obj 2> NUL
+	@-erase /s $(DIROBJ)\*.pch 2> NUL
+	@-erase /s $(DIROBJ)\*.pdb 2> NUL
+	@-erase /s $(DIROBJ)\*.res 2> NUL
+
 !ENDIF  # End of case where a config was provided.
--- a/90
+++ b/90
@ -1,3 +1,93 @@
+- 10/13/14: version 0.4.2
+  This is a binary compatible release.
+  * Android / gcc build fixes
+  * (Windows) fix reading from stdin and writing to stdout
+  * gif2webp: miscellaneous fixes
+  * fix 'alpha-leak' with lossy compression (issue #220)
+  * the lossless bitstream spec has been amended to reflect the current code
+
+- 7/24/14: version 0.4.1
+  This is a binary compatible release.
+  * AArch64 (arm64) & MIPS support/optimizations
+  * NEON assembly additions:
+    - ~25% faster lossy decode / encode (-m 4)
+    - ~10% faster lossless decode
+    - ~5-10% faster lossless encode (-m 3/4)
+  * dwebp/vwebp can read from stdin
+  * cwebp/gif2webp can write to stdout
+  * cwebp can read webp files; useful if storing sources as webp lossless
+
+- 12/19/13: version 0.4.0
+  * improved gif2webp tool
+  * numerous fixes, compression improvement and speed-up
+  * dither option added to decoder (dwebp -dither 50 ...)
+  * improved multi-threaded modes (-mt option)
+  * improved filtering strength determination
+  * New function: WebPMuxGetCanvasSize
+  * BMP and TIFF format output added to 'dwebp'
+  * Significant memory reduction for decoding lossy images with alpha.
+  * Intertwined decoding of RGB and alpha for a shorter
+    time-to-first-decoded-pixel.
+  * WebPIterator has a new member 'has_alpha' denoting whether the frame
+    contains transparency.
+  * Container spec amended with new 'blending method' for animation.
+
+- 6/13/13: version 0.3.1
+  This is a binary compatible release.
+  * Add incremental decoding support for images containing ALPH and ICCP chunks.
+  * Python bindings via swig for the simple encode/decode interfaces similar to
+    Java.
+
+- 3/20/13: version 0.3.0
+  This is a binary compatible release.
+  * WebPINewRGB/WebPINewYUVA accept being passed a NULL output buffer
+    and will perform auto-allocation.
+  * default filter option is now '-strong -f 60'
+  * encoding speed-up for lossy methods 3 to 6
+  * alpha encoding can be done in parallel to lossy using 'cwebp -mt ...'
+  * color profile, metadata (XMP/EXIF) and animation support finalized in the
+    container.
+  * various NEON assembly additions
+  Tool updates / additions:
+    * gif2webp added
+    * vwebp given color profile & animation support
+    * cwebp can preserve color profile / metadata with '-metadata'
+
+- 10/30/12: version 0.2.1
+  * Various security related fixes
+  * cwebp.exe: fix import errors on Windows XP
+  * enable DLL builds for mingw targets
+
+- 8/3/12: version 0.2.0
+  * Add support for ARGB -> YUVA conversion for lossless decoder
+    New functions: WebPINewYUVA, WebPIDecGetYUVA
+  * Add stats for lossless and alpha encoding
+  * Security related hardening: allocation and size checks
+  * Add PAM output support to dwebp
+
+- 7/19/12: version 0.1.99
+  * This is a pre-release of 0.2.0, not an rc to allow for further
+    incompatible changes based on user feedback.
+  * Alpha channel encode/decode support.
+  * Lossless encoder/decoder.
+  * Add TIFF input support to cwebp.
+  Incompatible changes:
+    * The encode ABI has been modified to support alpha encoding.
+    * Deprecated function WebPINew() has been removed.
+    * Decode function signatures have changed to consistently use size_t over
+      int/uint32_t.
+    * decode_vp8.h is no longer installed system-wide.
+    * cwebp will encode the alpha channel if present.
+
+- 9/19/11: version 0.1.3
+  * Advanced decoding APIs.
+  * On-the-fly cropping and rescaling of images.
+  * SSE2 instructions for decoding performance optimizations on x86 based platforms.
+  * Support Multi-threaded decoding.
+  * 40% improvement in Decoding performance.
+  * Add support for RGB565, RGBA4444 & ARGB image colorspace.
+  * Better handling of large picture encoding.
+
 - 3/25/11: version 0.1.2
  * Incremental decoding: picture can be decoded byte-by-byte if needs be.
  * lot of bug-fixes, consolidation and stabilization
--- a/39
+++ b/39
@ -1,22 +1,23 @@
 Additional IP Rights Grant (Patents)
+------------------------------------

-"This implementation" means the copyrightable works distributed by
-Google as part of the WebM Project.
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.

-Google hereby grants to you a perpetual, worldwide, non-exclusive,
-no-charge, royalty-free, irrevocable (except as stated in this section)
-patent license to make, have made, use, offer to sell, sell, import,
-transfer, and otherwise run, modify and propagate the contents of this
-implementation of VP8, where such license applies only to those patent
-claims, both currently owned by Google and acquired in the future,
-licensable by Google that are necessarily infringed by this
-implementation of VP8. This grant does not include claims that would be
-infringed only as a consequence of further modification of this
-implementation. If you or your agent or exclusive licensee institute or
-order or agree to the institution of patent litigation against any
-entity (including a cross-claim or counterclaim in a lawsuit) alleging
-that this implementation of VP8 or any code incorporated within this
-implementation of VP8 constitutes direct or contributory patent
-infringement, or inducement of patent infringement, then any patent
-rights granted to you under this License for this implementation of VP8
-shall terminate as of the date such litigation is filed.
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitutes direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
--- a/419
+++ b/419
@ -4,16 +4,16 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.1.2
+               \____/____/\_____/_____/____/v0.4.2

 Description:
 ============

-WEBP codec: Library to encode and decode images in WebP format. This package
+WebP codec: library to encode and decode images in WebP format. This package
 contains the library that can be used in other programs to add WebP support,
 as well as the command line tools 'cwebp' and 'dwebp'.

-See http://code.google.com/speed/webp
+See http://developers.google.com/speed/webp

 Latest sources are available from http://www.webmproject.org/code/

@ -32,9 +32,11 @@ By running:

  nmake /f Makefile.vc CFG=release-static RTLIBCFG=static OBJDIR=output

-the directory output\release-static\x86\bin will contain the tools
-cweb.exe and dweb.exe. The directory output\release-static\x86\lib will
-contains the libwebp static library.
+the directory output\release-static\(x64|x86)\bin will contain the tools
+cwebp.exe and dwebp.exe. The directory output\release-static\(x64|x86)\lib will
+contain the libwebp static library.
+The target architecture (x86/x64) is detected by Makefile.vc from the Visual
+Studio compiler (cl.exe) available in the system path.

 Unix build using makefile.unix:
 -------------------------------
@ -47,22 +49,20 @@ will build the binaries examples/cwebp and examples/dwebp, along
 with the static library src/libwebp.a. No system-wide installation
 is supplied, as this is a simple alternative to the full installation
 system based on the autoconf tools (see below).
-Please refer the makefile.unix for additional details and customizations.
+Please refer to makefile.unix for additional details and customizations.

 Using autoconf tools:
 ---------------------
-./autogen.sh
+When building from git sources, you will need to run autogen.sh to generate the
+configure script.
+
 ./configure
 make
 make install

-Note: In case './configure' step fails, try generating configure & appropriate
-Makefile(s) via command 'aclocal && autoconf && automake -a -c;'.
-
 should be all you need to have the following files

 /usr/local/include/webp/decode.h
-/usr/local/include/webp/decode_vp8.h
 /usr/local/include/webp/encode.h
 /usr/local/include/webp/types.h
 /usr/local/lib/libwebp.*
@ -71,10 +71,52 @@ should be all you need to have the following files

 installed.

-Note: The encoding and decoding libraries are compiled separately
-(as src/dec/libwebpdecode.* and src/dec/libwebpencode.*). They
-can be installed independently using a minor modifications in the
-corresponding Makefile.am configure files (see comments there).
+Note: A decode-only library, libwebpdecoder, is available using the
+'--enable-libwebpdecoder' flag. The encode library is built separately and can
+be installed independently using a minor modification in the corresponding
+Makefile.am configure files (see comments there). See './configure --help' for
+more options.
+
+SWIG bindings:
+--------------
+
+To generate language bindings from swig/libwebp.swig at least swig-1.3
+(http://www.swig.org) is required.
+
+Currently the following functions are mapped:
+Decode:
+  WebPGetDecoderVersion
+  WebPGetInfo
+  WebPDecodeRGBA
+  WebPDecodeARGB
+  WebPDecodeBGRA
+  WebPDecodeBGR
+  WebPDecodeRGB
+
+Encode:
+  WebPGetEncoderVersion
+  WebPEncodeRGBA
+  WebPEncodeBGRA
+  WebPEncodeRGB
+  WebPEncodeBGR
+  WebPEncodeLosslessRGBA
+  WebPEncodeLosslessBGRA
+  WebPEncodeLosslessRGB
+  WebPEncodeLosslessBGR
+
+See swig/README for more detailed build instructions.
+
+Java bindings:
+
+To build the swig-generated JNI wrapper code at least JDK-1.5 (or equivalent)
+is necessary for enum support. The output is intended to be a shared object /
+DLL that can be loaded via System.loadLibrary("webp_jni").
+
+Python bindings:
+
+To build the swig-generated Python extension code at least Python 2.6 is
+required. Python < 2.6 may build with some minor changes to libwebp.swig or the
+generated code, but is untested.

 Encoding tool:
 ==============
@ -84,9 +126,13 @@ decoding (dwebp) images.

 The easiest use should look like:
  cwebp input.png -q 80 -o output.webp
-which will convert the input PNG or JPEG file to a WebP one using a
-quality factor of 80 on a 0->100 scale (0 being the lowest quality,
-100 being the best. Default value is 75).
+which will convert the input file to a WebP file using a quality factor of 80
+on a 0->100 scale (0 being the lowest quality, 100 being the best. Default
+value is 75).
+You might want to try the -lossless flag too, which will compress the source
+(in RGBA format) without any loss. The -q quality parameter will in this case
+control the amount of processing time spent trying to make the output file as
+small as possible.

 A longer list of options is available using the -longhelp command line flag:

@ -94,41 +140,71 @@ A longer list of options is available using the -longhelp command line flag:
 Usage:
 cwebp [-preset <...>] [options] in_file [-o out_file]

-If input size (-s) for an image is not specified, it is assumed to be a either
-  PNG or JPEG format file.
-options:
+If input size (-s) for an image is not specified, it is
+assumed to be a PNG, JPEG, TIFF or WebP file.
+
+Options:
  -h / -help  ............ short help
  -H / -longhelp  ........ long help
  -q <float> ............. quality factor (0:small..100:big)
-  -preset <string> ....... Preset setting, one of:
+  -alpha_q <int> ......... transparency-compression quality (0..100)
+  -preset <string> ....... preset setting, one of:
                            default, photo, picture,
                            drawing, icon, text
-     -preset must come first, as it overwrites other parameters.
+     -preset must come first, as it overwrites other parameters
+
  -m <int> ............... compression method (0=fast, 6=slowest)
  -segments <int> ........ number of segments to use (1..4)
+  -size <int> ............ target size (in bytes)
+  -psnr <float> .......... target PSNR (in dB. typically: 42)

-  -s <int> <int> ......... Input size (width x height) for YUV
-  -sns <int> ............. Spatial Noise Shaping (0:off, 100:max)
+  -s <int> <int> ......... input size (width x height) for YUV
+  -sns <int> ............. spatial noise shaping (0:off, 100:max)
  -f <int> ............... filter strength (0=off..100)
  -sharpness <int> ....... filter sharpness (0:most .. 7:least sharp)
-  -strong ................ use strong filter instead of simple.
+  -strong ................ use strong filter instead of simple (default)
+  -nostrong .............. use simple filter instead of strong
+  -partition_limit <int> . limit quality to fit the 512k limit on
+                           the first partition (0=no degradation ... 100=full)
  -pass <int> ............ analysis pass number (1..10)
-  -partitions <int> ...... number of partitions to use (0..3)
  -crop <x> <y> <w> <h> .. crop picture with the given rectangle
-  -map <int> ............. print map of extra info.
-  -d <file.pgm> .......... dump the compressed output (PGM file).
+  -resize <w> <h> ........ resize picture (after any cropping)
+  -mt .................... use multi-threading if available
+  -low_memory ............ reduce memory usage (slower encoding)
+  -map <int> ............. print map of extra info
+  -print_psnr ............ prints averaged PSNR distortion
+  -print_ssim ............ prints averaged SSIM distortion
+  -print_lsim ............ prints local-similarity distortion
+  -d <file.pgm> .......... dump the compressed output (PGM file)
+  -alpha_method <int> .... transparency-compression method (0..1)
+  -alpha_filter <string> . predictive filtering for alpha plane,
+                           one of: none, fast (default) or best
+  -alpha_cleanup ......... clean RGB values in transparent area
+  -blend_alpha <hex> ..... blend colors against background color
+                           expressed as RGB values written in
+                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0
+                           green=0xe0 and blue=0xd0
+  -noalpha ............... discard any transparency information
+  -lossless .............. encode image losslessly
+  -hint <string> ......... specify image characteristics hint,
+                           one of: photo, picture or graph
+
+  -metadata <string> ..... comma separated list of metadata to
+                           copy from the input to the output if present.
+                           Valid values: all, none (default), exif, icc, xmp

  -short ................. condense printed message
-  -quiet ................. don't print anything.
+  -quiet ................. don't print anything
+  -version ............... print version number and exit
+  -noasm ................. disable all assembly optimizations
  -v ..................... verbose, e.g. print encoding/decoding times
+  -progress .............. report encoding progress

 Experimental Options:
-  -size <int> ............ Target size (in bytes)
-  -psnr <float> .......... Target PSNR (in dB. typically: 42)
-  -af <int> .............. adjust filter strength (0=off, 1=on)
+  -jpeg_like ............. roughly match expected JPEG size
+  -af .................... auto-adjust filter strength
  -pre <int> ............. pre-processing filter

-
 The main options you might want to try in order to further tune the
 visual quality are:
 -preset
@ -137,7 +213,7 @@ visual quality are:
 -m

 Namely:
-  * 'preset' will set up a default encoding configuration targetting a
+  * 'preset' will set up a default encoding configuration targeting a
     particular type of input. It should appear first in the list of options,
     so that subsequent options can take effect on top of this preset.
     Default value is 'default'.
@ -149,10 +225,11 @@ Namely:
     but with better quality.
     Typical value is around '75'.
  * 'f' option directly links to the filtering strength used by the codec's
-     in-loop processing. The higher, the smoother will highly-compressed area
-     look. This is particularly useful when aiming at very small files.
-     Typical values are around 20-30. Note that using the option -strong will
-     change the type of filtering. Use "-f 0" to turn filtering off.
+     in-loop processing. The higher the value, the smoother the
+     highly-compressed area will look. This is particularly useful when aiming
+     at very small files. Typical values are around 20-30. Note that using the
+     option -strong/-nostrong will change the type of filtering. Use "-f 0" to
+     turn filtering off.
  * 'm' controls the trade-off between encoding speed and quality. Default is 4.
     You can try -m 5 or -m 6 to explore more (time-consuming) encoding
     possibilities. A lower value will result in faster encoding at the expense
@ -161,7 +238,7 @@ Namely:
 Decoding tool:
 ==============

-There is a decoding sample code as examples/dwebp.c which will take
+There is a decoding sample in examples/dwebp.c which will take
 a .webp file and decode it to a PNG image file (amongst other formats).
 This is simply to demonstrate the use of the API. You can verify the
 file test.webp decodes to exactly the same as test_ref.ppm by using:
@ -170,9 +247,127 @@ file test.webp decodes to exactly the same as test_ref.ppm by using:
 ./dwebp test.webp -ppm -o test.ppm
 diff test.ppm test_ref.ppm

+The full list of options is available using -h:
+
+> dwebp -h
+Usage: dwebp in_file [options] [-o out_file]
+
+Decodes the WebP image file to PNG format [Default]
+Use following options to convert into alternate image formats:
+  -pam ......... save the raw RGBA samples as a color PAM
+  -ppm ......... save the raw RGB samples as a color PPM
+  -bmp ......... save as uncompressed BMP format
+  -tiff ........ save as uncompressed TIFF format
+  -pgm ......... save the raw YUV samples as a grayscale PGM
+                 file with IMC4 layout
+  -yuv ......... save the raw YUV samples in flat layout
+
+ Other options are:
+  -version  .... print version number and exit
+  -nofancy ..... don't use the fancy YUV420 upscaler
+  -nofilter .... disable in-loop filtering
+  -nodither .... disable dithering
+  -dither <d> .. dithering strength (in 0..100)
+  -mt .......... use multi-threading
+  -crop <x> <y> <w> <h> ... crop output with the given rectangle
+  -scale <w> <h> .......... scale the output (*after* any cropping)
+  -alpha ....... only save the alpha plane
+  -incremental . use incremental decoding (useful for tests)
+  -h     ....... this help message
+  -v     ....... verbose (e.g. print encoding/decoding times)
+  -noasm ....... disable all assembly optimizations
+
+Visualization tool:
+===================
+
+There's a little self-serve visualization tool called 'vwebp' under the
+examples/ directory. It uses OpenGL to open a simple drawing window and show
+a decoded WebP file. It's not yet integrated in the automake build system, but
+you can try to manually compile it using the recommendations below.
+
+Usage: vwebp in_file [options]
+
+Decodes the WebP image file and visualize it using OpenGL
+Options are:
+  -version  .... print version number and exit
+  -noicc ....... don't use the icc profile if present
+  -nofancy ..... don't use the fancy YUV420 upscaler
+  -nofilter .... disable in-loop filtering
+  -dither <int>  dithering strength (0..100), default=50
+  -mt .......... use multi-threading
+  -info ........ print info
+  -h     ....... this help message
+
+Keyboard shortcuts:
+  'c' ................ toggle use of color profile
+  'i' ................ overlay file information
+  'q' / 'Q' / ESC .... quit
+
+Building:
+---------
+
+Prerequisites:
+1) OpenGL & OpenGL Utility Toolkit (GLUT)
+  Linux:
+    $ sudo apt-get install freeglut3-dev mesa-common-dev
+  Mac + XCode:
+    - These libraries should be available in the OpenGL / GLUT frameworks.
+  Windows:
+    http://freeglut.sourceforge.net/index.php#download
+
+2) (Optional) qcms (Quick Color Management System)
+  i. Download qcms from Mozilla / Chromium:
+    http://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
+    http://src.chromium.org/viewvc/chrome/trunk/src/third_party/qcms
+  ii. Build and archive the source files as libqcms.a / qcms.lib
+  iii. Update makefile.unix / Makefile.vc
+    a) Define WEBP_HAVE_QCMS
+    b) Update include / library paths to reference the qcms directory.
+
+Build using makefile.unix / Makefile.vc:
+$ make -f makefile.unix examples/vwebp
+> nmake /f Makefile.vc CFG=release-static \
+    ../obj/x64/release-static/bin/vwebp.exe
+
+Animated GIF conversion:
+========================
+Animated GIF files can be converted to WebP files with animation using the
+gif2webp utility available under examples/. The files can then be viewed using
+vwebp.
+
+Usage:
+ gif2webp [options] gif_file -o webp_file
+Options:
+  -h / -help  ............ this help
+  -lossy ................. encode image using lossy compression
+  -mixed ................. for each frame in the image, pick lossy
+                           or lossless compression heuristically
+  -q <float> ............. quality factor (0:small..100:big)
+  -m <int> ............... compression method (0=fast, 6=slowest)
+  -kmin <int> ............ min distance between key frames
+  -kmax <int> ............ max distance between key frames
+  -f <int> ............... filter strength (0=off..100)
+  -metadata <string> ..... comma separated list of metadata to
+                           copy from the input to the output if present
+                           Valid values: all, none, icc, xmp (default)
+  -mt .................... use multi-threading if available
+
+  -version ............... print version number and exit
+  -v ..................... verbose
+  -quiet ................. don't print anything
+
+Building:
+---------
+With the libgif development files installed, gif2webp can be built using
+makefile.unix:
+$ make -f makefile.unix examples/gif2webp
+
+or using autoconf:
+$ ./configure --enable-everything
+$ make

 Encoding API:
-===========
+=============

 The main encoding functions are available in the header src/webp/encode.h
 The ready-to-use ones are:
@ -188,10 +383,26 @@ size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride,
 They will convert raw RGB samples to a WebP data. The only control supplied
 is the quality factor.

+There are some variants for using the lossless format:
+
+size_t WebPEncodeLosslessRGB(const uint8_t* rgb, int width, int height,
+                             int stride, uint8_t** output);
+size_t WebPEncodeLosslessBGR(const uint8_t* bgr, int width, int height,
+                             int stride, uint8_t** output);
+size_t WebPEncodeLosslessRGBA(const uint8_t* rgba, int width, int height,
+                              int stride, uint8_t** output);
+size_t WebPEncodeLosslessBGRA(const uint8_t* bgra, int width, int height,
+                              int stride, uint8_t** output);
+
+Of course in this case, no quality factor is needed since the compression
+occurs without loss of the input values, at the expense of larger output sizes.
+
+Advanced encoding API:
+----------------------

 A more advanced API is based on the WebPConfig and WebPPicture structures.

-WebPConfig contains the encoding settings and is not tied a to a particular
+WebPConfig contains the encoding settings and is not tied to a particular
 picture.
 WebPPicture contains input data, on which some WebPConfig will be used for
 compression.
@ -210,7 +421,7 @@ The encoding flow looks like:
  // ... additional tuning
  config.sns_strength = 90;
  config.filter_sharpness = 6;
-  config_error = WebPValidateConfig(&config);  // not mandartory, but useful
+  config_error = WebPValidateConfig(&config);  // not mandatory, but useful

  // Setup the input data
  WebPPicture pic;
@ -223,50 +434,51 @@ The encoding flow looks like:
  if (!WebPPictureAllocate(&pic)) {
    return 0;   // memory error
  }
-  // add that point, 'pic' has been initialized as a container,
+  // at this point, 'pic' has been initialized as a container,
  // and can receive the Y/U/V samples.
  // Alternatively, one could use ready-made import functions like
  // WebPPictureImportRGB(), which will take care of memory allocation.
  // In any case, past this point, one will have to call
  // WebPPictureFree(&pic) to reclaim memory.

-
  // Set up a byte-output write method. WebPMemoryWriter, for instance.
  WebPMemoryWriter wrt;
+  WebPMemoryWriterInit(&wrt);     // initialize 'wrt'
+
  pic.writer = MyFileWriter;
  pic.custom_ptr = my_opaque_structure_to_make_MyFileWriter_work;
-  // initialize 'wrt' here...

  // Compress!
-  int ok = WebPEncode(&config, &pic);   // ok = 0 => error occured!
+  int ok = WebPEncode(&config, &pic);   // ok = 0 => error occurred!
  WebPPictureFree(&pic);  // must be called independently of the 'ok' result.

  // output data should have been handled by the writer at that point.
+  // -> compressed data is the memory buffer described by wrt.mem / wrt.size
+
+  // deallocate the memory used by compressed data
+  WebPMemoryWriterClear(&wrt);

 -------------------------------------- END PSEUDO EXAMPLE

-
 Decoding API:
 =============

 This is mainly just one function to call:

 #include "webp/decode.h"
-uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height);
+uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
+                       int* width, int* height);

 Please have a look at the file src/webp/decode.h for the details.
-There are variants for decoding in BGR/RGBA/BGRA order, along with decoding to
-raw Y'CbCr samples. One can also decode the image directly into a pre-allocated
-buffer.
+There are variants for decoding in BGR/RGBA/ARGB/BGRA order, along with
+decoding to raw Y'CbCr samples. One can also decode the image directly into a
+pre-allocated buffer.

-To detect a WebP file and gather picture's dimensions, the function:
-  int WebPGetInfo(const uint8_t* data, uint32_t data_size,
-                  int *width, int *height);
+To detect a WebP file and gather the picture's dimensions, the function:
+  int WebPGetInfo(const uint8_t* data, size_t data_size,
+                  int* width, int* height);
 is supplied. No decoding is involved when using it.

-A lower-level API is available from the header file <webp/decode_vp8.h>
-
 Incremental decoding API:
 =========================

@ -276,7 +488,11 @@ is stored into an instance of the WebPIDecoder object. This object can be
 created with the purpose of decoding either RGB or Y'CbCr samples.
 For instance:

-  WebPIDecoder* idec = WebPINew(MODE_BGR);
+  WebPDecBuffer buffer;
+  WebPInitDecBuffer(&buffer);
+  buffer.colorspace = MODE_BGR;
+  ...
+  WebPIDecoder* idec = WebPINewDecoder(&buffer);

 As data is made progressively available, this incremental-decoder object
 can be used to decode the picture further. There are two (mutually exclusive)
@ -290,26 +506,92 @@ or by just mentioning the new size of the transmitted data:

  WebPIUpdate(idec, buffer, size_of_transmitted_buffer);

-Note that 'buffer' can be modified between each calls to WebPIUpdate, in
-particular when the buffer is resized to accomodate larger data.
+Note that 'buffer' can be modified between each call to WebPIUpdate, in
+particular when the buffer is resized to accommodate larger data.

 These functions will return the decoding status: either VP8_STATUS_SUSPENDED if
-decoding is not finished yet, or VP8_STATUS_OK when decoding is done.
-Any other status is an error condition.
+decoding is not finished yet or VP8_STATUS_OK when decoding is done. Any other
+status is an error condition.

-The idec object must always be released (even upon an error condition)
-by calling: WebPDelete(idec)
+The 'idec' object must always be released (even upon an error condition) by
+calling: WebPDelete(idec).

 To retrieve partially decoded picture samples, one must use the corresponding
-method: WebPIDecGetRGB or WebPIDecGetYUV.
+method: WebPIDecGetRGB or WebPIDecGetYUVA.
 It will return the last displayable pixel row.

 Lastly, note that decoding can also be performed into a pre-allocated pixel
 buffer. This buffer must be passed when creating a WebPIDecoder, calling
-WebPINewRGB() or WebPINewYUV().
+WebPINewRGB() or WebPINewYUVA().

 Please have a look at the src/webp/decode.h header for further details.

+Advanced Decoding API:
+======================
+
+WebP decoding supports an advanced API which provides on-the-fly cropping and
+rescaling, something of great usefulness on memory-constrained environments like
+mobile phones. Basically, the memory usage will scale with the output's size,
+not the input's, when one only needs a quick preview or a zoomed in portion of
+an otherwise too-large picture. Some CPU can be saved too, incidentally.
+
+-------------------------------------- BEGIN PSEUDO EXAMPLE
+     // A) Init a configuration object
+     WebPDecoderConfig config;
+     CHECK(WebPInitDecoderConfig(&config));
+
+     // B) optional: retrieve the bitstream's features.
+     CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
+
+     // C) Adjust 'config' options, if needed
+     config.options.no_fancy_upsampling = 1;
+     config.options.use_scaling = 1;
+     config.options.scaled_width = scaledWidth();
+     config.options.scaled_height = scaledHeight();
+     // etc.
+
+     // D) Specify 'config' output options for specifying output colorspace.
+     // Optionally the external image decode buffer can also be specified.
+     config.output.colorspace = MODE_BGRA;
+     // Optionally, the config.output can be pointed to an external buffer as
+     // well for decoding the image. This externally supplied memory buffer
+     // should be big enough to store the decoded picture.
+     config.output.u.RGBA.rgba = (uint8_t*) memory_buffer;
+     config.output.u.RGBA.stride = scanline_stride;
+     config.output.u.RGBA.size = total_size_of_the_memory_buffer;
+     config.output.is_external_memory = 1;
+
+     // E) Decode the WebP image. There are two variants w.r.t decoding image.
+     // The first one (E.1) decodes the full image and the second one (E.2) is
+     // used to incrementally decode the image using small input buffers.
+     // Any one of these steps can be used to decode the WebP image.
+
+     // E.1) Decode full image.
+     CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
+
+     // E.2) Decode image incrementally.
+     WebPIDecoder* const idec = WebPIDecode(NULL, NULL, &config);
+     CHECK(idec != NULL);
+     while (bytes_remaining > 0) {
+       VP8StatusCode status = WebPIAppend(idec, input, bytes_read);
+       if (status == VP8_STATUS_OK || status == VP8_STATUS_SUSPENDED) {
+         bytes_remaining -= bytes_read;
+       } else {
+         break;
+       }
+     }
+     WebPIDelete(idec);
+
+     // F) Decoded image is now in config.output (and config.output.u.RGBA).
+     // It can be saved, displayed or otherwise processed.
+
+     // G) Reclaim memory allocated in config's object. It's safe to call
+     // this function even if the memory is external and wasn't allocated
+     // by WebPDecode().
+     WebPFreeDecBuffer(&config.output);
+
+-------------------------------------- END PSEUDO EXAMPLE
+
 Bugs:
 =====

@ -322,3 +604,4 @@ Discuss:
 ========

 Email: webp-discuss@webmproject.org
+Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
--- a/README.mux
+++ b/README.mux
@ -0,0 +1,186 @@
+          __   __  ____  ____  ____  __ __  _     __ __
+         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
+         \       /   __/  _  \   __/      /  /  (_/  /__
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.2.2
+
+
+Description:
+============
+
+WebPMux: set of two libraries 'Mux' and 'Demux' for creation, extraction and
+manipulation of an extended format WebP file, which can have features like
+color profile, metadata and animation. Reference command-line tools 'webpmux'
+and 'vwebp' as well as the WebP container specification
+'doc/webp-container-spec.txt' are also provided in this package.
+
+WebP Mux tool:
+==============
+
+The examples/ directory contains a tool (webpmux) for manipulating WebP
+files. The webpmux tool can be used to create an extended format WebP file and
+also to extract or strip relevant data from such a file.
+
+A list of options is available using the -help command line flag:
+
+> webpmux -help
+Usage: webpmux -get GET_OPTIONS INPUT -o OUTPUT
+       webpmux -set SET_OPTIONS INPUT -o OUTPUT
+       webpmux -strip STRIP_OPTIONS INPUT -o OUTPUT
+       webpmux -frame FRAME_OPTIONS [-frame...] [-loop LOOP_COUNT]
+               [-bgcolor BACKGROUND_COLOR] -o OUTPUT
+       webpmux -info INPUT
+       webpmux [-h|-help]
+       webpmux -version
+
+GET_OPTIONS:
+ Extract relevant data:
+   icc       get ICC profile
+   exif      get EXIF metadata
+   xmp       get XMP metadata
+   frame n   get nth frame
+
+SET_OPTIONS:
+ Set color profile/metadata:
+   icc  file.icc     set ICC profile
+   exif file.exif    set EXIF metadata
+   xmp  file.xmp     set XMP metadata
+   where:    'file.icc' contains the ICC profile to be set,
+             'file.exif' contains the EXIF metadata to be set
+             'file.xmp' contains the XMP metadata to be set
+
+STRIP_OPTIONS:
+ Strip color profile/metadata:
+   icc       strip ICC profile
+   exif      strip EXIF metadata
+   xmp       strip XMP metadata
+
+FRAME_OPTIONS(i):
+ Create animation:
+   file_i +di+[xi+yi[+mi[bi]]]
+   where:    'file_i' is the i'th animation frame (WebP format),
+             'di' is the pause duration before next frame,
+             'xi','yi' specify the image offset for this frame,
+             'mi' is the dispose method for this frame (0 or 1),
+             'bi' is the blending method for this frame (+b or -b)
+
+LOOP_COUNT:
+ Number of times to repeat the animation.
+ Valid range is 0 to 65535 [Default: 0 (infinite)].
+
+BACKGROUND_COLOR:
+ Background color of the canvas.
+  A,R,G,B
+  where:    'A', 'R', 'G' and 'B' are integers in the range 0 to 255 specifying
+            the Alpha, Red, Green and Blue component values respectively
+            [Default: 255,255,255,255]
+
+INPUT & OUTPUT are in WebP format.
+
+Note: The nature of EXIF, XMP and ICC data is not checked and is assumed to be
+valid.
+
+Visualization tool:
+===================
+
+The examples/ directory also contains a tool (vwebp) for viewing WebP files.
+It decodes the image and visualizes it using OpenGL. See the libwebp README
+for details on building and running this program.
+
+Mux API:
+========
+The Mux API contains methods for adding data to and reading data from WebP
+files. This API currently supports XMP/EXIF metadata, ICC profile and animation.
+Other features may be added in subsequent releases.
+
+Example#1 (pseudo code): Creating a WebPMux object with image data, color
+profile and XMP metadata.
+
+  int copy_data = 0;
+  WebPMux* mux = WebPMuxNew();
+  // ... (Prepare image data).
+  WebPMuxSetImage(mux, &image, copy_data);
+  // ... (Prepare ICC profile data).
+  WebPMuxSetChunk(mux, "ICCP", &icc_profile, copy_data);
+  // ... (Prepare XMP metadata).
+  WebPMuxSetChunk(mux, "XMP ", &xmp, copy_data);
+  // Get data from mux in WebP RIFF format.
+  WebPMuxAssemble(mux, &output_data);
+  WebPMuxDelete(mux);
+  // ... (Consume output_data; e.g. write output_data.bytes to file).
+  WebPDataClear(&output_data);
+
+
+Example#2 (pseudo code): Get image and color profile data from a WebP file.
+
+  int copy_data = 0;
+  // ... (Read data from file).
+  WebPMux* mux = WebPMuxCreate(&data, copy_data);
+  WebPMuxGetFrame(mux, 1, &image);
+  // ... (Consume image; e.g. call WebPDecode() to decode the data).
+  WebPMuxGetChunk(mux, "ICCP", &icc_profile);
+  // ... (Consume icc_profile).
+  WebPMuxDelete(mux);
+  free(data);
+
+
+For a detailed Mux API reference, please refer to the header file
+(src/webp/mux.h).
+
+Demux API:
+==========
+The Demux API enables extraction of images and extended format data from
+WebP files. This API currently supports reading of XMP/EXIF metadata, ICC
+profile and animated images. Other features may be added in subsequent
+releases.
+
+Code Example: Demuxing WebP data to extract all the frames, ICC profile
+and EXIF/XMP metadata.
+
+  WebPDemuxer* demux = WebPDemux(&webp_data);
+  uint32_t width = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
+  uint32_t height = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
+  // ... (Get information about the features present in the WebP file).
+  uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
+
+  // ... (Iterate over all frames).
+  WebPIterator iter;
+  if (WebPDemuxGetFrame(demux, 1, &iter)) {
+    do {
+      // ... (Consume 'iter'; e.g. Decode 'iter.fragment' with WebPDecode(),
+      // ... and get other frame properties like width, height, offsets etc.
+      // ... see 'struct WebPIterator' below for more info).
+    } while (WebPDemuxNextFrame(&iter));
+    WebPDemuxReleaseIterator(&iter);
+  }
+
+  // ... (Extract metadata).
+  WebPChunkIterator chunk_iter;
+  if (flags & ICCP_FLAG) WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter);
+  // ... (Consume the ICC profile in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  if (flags & EXIF_FLAG) WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter);
+  // ... (Consume the EXIF metadata in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  if (flags & XMP_FLAG) WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter);
+  // ... (Consume the XMP metadata in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  WebPDemuxDelete(demux);
+
+
+For a detailed Demux API reference, please refer to the header file
+(src/webp/demux.h).
+
+
+Bugs:
+=====
+
+Please report all bugs to our issue tracker:
+    http://code.google.com/p/webp/issues
+Patches welcome! See this page to get started:
+    http://www.webmproject.org/code/contribute/submitting-patches/
+
+Discuss:
+========
+
+Email: webp-discuss@webmproject.org
+Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
--- a/configure.ac
+++ b/configure.ac
@ -1,73 +1,573 @@
-AC_INIT([webpdecode], [0.1])
+AC_INIT([libwebp], [0.4.2],
+        [http://code.google.com/p/webp/issues],,
+        [http://developers.google.com/speed/webp])
+AC_CANONICAL_HOST
+AC_PREREQ([2.60])
 AM_INIT_AUTOMAKE([-Wall foreign subdir-objects])
+
+dnl === automake >= 1.12 requires this for 'unusual archivers' support.
+dnl === it must occur before LT_INIT (AC_PROG_LIBTOOL).
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
+
 AC_PROG_LIBTOOL
 AM_PROG_CC_C_O

-AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=PATH],
-	[Path to the pkgconfig directory [[LIBDIR/pkgconfig]]]),
-	[pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
+dnl === Enable less verbose output when building.
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+dnl == test endianness
+AC_C_BIGENDIAN
+
+dnl === SET_IF_UNSET(shell_var, value)
+dnl ===   Set the shell variable 'shell_var' to 'value' if it is unset.
+AC_DEFUN([SET_IF_UNSET], [test "${$1+set}" = "set" || $1=$2])
+
+AC_ARG_ENABLE([everything],
+              AS_HELP_STRING([--enable-everything],
+                             [Enable all optional targets. These can still be
+                              disabled with --disable-target]),
+              [SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
+               SET_IF_UNSET([enable_libwebpdemux], [$enableval])
+               SET_IF_UNSET([enable_libwebpmux], [$enableval])])
+
+AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=DIR],
+            [Path to the pkgconfig directory @<:@LIBDIR/pkgconfig@:>@]),
+            [pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
 AC_SUBST([pkgconfigdir])

+dnl === TEST_AND_ADD_CFLAGS(var, flag)
+dnl ===   Checks whether $CC supports 'flag' and adds it to 'var'
+dnl ===   on success.
+AC_DEFUN([TEST_AND_ADD_CFLAGS],
+         [SAVED_CFLAGS="$CFLAGS"
+          CFLAGS="-Werror $2"
+          AC_MSG_CHECKING([whether $CC supports $2])
+          dnl Note AC_LANG_PROGRAM([]) uses an old-style main definition.
+          AC_COMPILE_IFELSE([AC_LANG_SOURCE([int main(void) { return 0; }])],
+                            [AC_MSG_RESULT([yes])]
+                            dnl Simply append the variable avoiding a
+                            dnl compatibility ifdef for AS_VAR_APPEND as this
+                            dnl variable shouldn't grow all that large.
+                            [$1="${$1} $2"],
+                            [AC_MSG_RESULT([no])])
+          CFLAGS="$SAVED_CFLAGS"])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat-security])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-prototypes])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wold-style-definition])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused-but-set-variable])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wvla])
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62040
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61622
+AS_IF([test "$GCC" = "yes" ], [
+       gcc_version=`$CC -dumpversion`
+       gcc_wht_bug=""
+       case "$host_cpu" in
+         aarch64|arm64)
+          case "$gcc_version" in
+            4.9|4.9.0|4.9.1) gcc_wht_bug=yes ;;
+          esac
+       esac
+       AS_IF([test "$gcc_wht_bug" = "yes"], [
+              TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-frename-registers])])])
+AC_SUBST([AM_CFLAGS])
+
+dnl === Check for machine specific flags
+TEST_AND_ADD_CFLAGS([AVX2_FLAGS], [-mavx2])
+AS_IF([test -n "$AVX2_FLAGS"], [
+  SAVED_CFLAGS=$CFLAGS
+  CFLAGS="$CFLAGS $AVX2_FLAGS"
+  AC_CHECK_HEADER([immintrin.h],
+                  [AC_DEFINE(WEBP_HAVE_AVX2, [1],
+                   [Set to 1 if AVX2 is supported])],
+                  [AVX2_FLAGS=""],
+                  dnl it's illegal to directly include avx2intrin.h, but it's
+                  dnl included conditionally in immintrin.h, tricky!
+                  [#ifndef __AVX2__
+                   #error avx2 is not enabled
+                   #endif
+                  ])
+  CFLAGS=$SAVED_CFLAGS])
+AC_SUBST([AVX2_FLAGS])
+
+TEST_AND_ADD_CFLAGS([SSE2_FLAGS], [-msse2])
+AS_IF([test -n "$SSE2_FLAGS"], [
+  SAVED_CFLAGS=$CFLAGS
+  CFLAGS="$CFLAGS $SSE2_FLAGS"
+  AC_CHECK_HEADER([emmintrin.h],
+                  [AC_DEFINE(WEBP_HAVE_SSE2, [1],
+                   [Set to 1 if SSE2 is supported])],
+                  [SSE2_FLAGS=""])
+  CFLAGS=$SAVED_CFLAGS])
+AC_SUBST([SSE2_FLAGS])
+
+dnl === CLEAR_LIBVARS([var_pfx])
+dnl ===   Clears <var_pfx>_{INCLUDES,LIBS}.
+AC_DEFUN([CLEAR_LIBVARS], [$1_INCLUDES=""; $1_LIBS=""])
+
+dnl === WITHLIB_OPTION([opt_pfx], [outvar_pfx])
+dnl ===   Defines --with-<opt_pfx>{include,lib}dir options which set
+dnl ===   the variables <outvar_pfx>_{INCLUDES,LIBS}.
+AC_DEFUN([WITHLIB_OPTION],
+  [AC_ARG_WITH([$1includedir],
+               AS_HELP_STRING([--with-$1includedir=DIR],
+                              [use $2 includes from DIR]),
+               $2_INCLUDES="-I$withval")
+   AC_ARG_WITH([$1libdir],
+               AS_HELP_STRING([--with-$1libdir=DIR],
+                              [use $2 libraries from DIR]),
+               [$2_LIBS="-L$withval"])])
+
+dnl === LIBCHECK_PROLOGUE([var_pfx])
+dnl ===   Caches the current values of CPPFLAGS/LIBS in SAVED_* then
+dnl ===   prepends the current values with <var_pfx>_{INCLUDES,LIBS}.
+AC_DEFUN([LIBCHECK_PROLOGUE],
+         [SAVED_CPPFLAGS=$CPPFLAGS
+          SAVED_LIBS=$LIBS
+          CPPFLAGS="$$1_INCLUDES $CPPFLAGS"
+          LIBS="$$1_LIBS $LIBS"])
+
+dnl === LIBCHECK_EPILOGUE([var_pfx])
+dnl ===   Restores the values of CPPFLAGS/LIBS from SAVED_* and exports
+dnl ===   <var_pfx>_{INCLUDES,LIBS} with AC_SUBST.
+AC_DEFUN([LIBCHECK_EPILOGUE],
+         [AC_SUBST($1_LIBS)
+          AC_SUBST($1_INCLUDES)
+          CPPFLAGS=$SAVED_CPPFLAGS
+          LIBS=$SAVED_LIBS])
+
+dnl === Check for gcc builtins
+
+dnl === CHECK_FOR_BUILTIN([builtin], [param], [define])
+dnl ===   links a C AC_LANG_PROGRAM, with <builtin>(<param>)
+dnl ===   AC_DEFINE'ing <define> if successful.
+AC_DEFUN([CHECK_FOR_BUILTIN],
+         [AC_LANG_PUSH([C])
+          AC_MSG_CHECKING([for $1])
+          AC_LINK_IFELSE([AC_LANG_PROGRAM([], [$1($2)])],
+                         [AC_MSG_RESULT([yes])
+                          AC_DEFINE([$3], [1],
+                                    [Set to 1 if $1 is available])],
+                         [AC_MSG_RESULT([no])]),
+          AC_LANG_POP])
+
+dnl AC_CHECK_FUNC doesn't work with builtin's.
+CHECK_FOR_BUILTIN([__builtin_bswap16], [1u << 15], [HAVE_BUILTIN_BSWAP16])
+CHECK_FOR_BUILTIN([__builtin_bswap32], [1u << 31], [HAVE_BUILTIN_BSWAP32])
+CHECK_FOR_BUILTIN([__builtin_bswap64], [1ull << 63], [HAVE_BUILTIN_BSWAP64])
+
+dnl === Check for pthread support
+AC_ARG_ENABLE([threading],
+              AS_HELP_STRING([--disable-threading],
+                             [Disable detection of thread support]),,
+              [enable_threading=yes])
+if test "$enable_threading" = "yes"; then
+  AC_MSG_NOTICE([checking for threading support...])
+  AX_PTHREAD([AC_DEFINE([WEBP_USE_THREAD], [1],
+                        [Undefine this to disable thread support.])
+              LIBS="$PTHREAD_LIBS $LIBS"
+              CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+              CC="$PTHREAD_CC"
+             ],
+             [AC_CHECK_FUNC([_beginthreadex],
+                            [AC_DEFINE([WEBP_USE_THREAD], [1],
+                                       [Undefine this to disable thread
+                                        support.])],
+                            [enable_threading=no])])
+fi
+AC_MSG_NOTICE([checking if threading is enabled... ${enable_threading-no}])
+
+dnl === check for OpenGL/GLUT support ===
+
+AC_ARG_ENABLE([gl], AS_HELP_STRING([--disable-gl],
+                                   [Disable detection of OpenGL support
+                                    @<:@default=auto@:>@]))
+AS_IF([test "x$enable_gl" != "xno"], [
+  CLEAR_LIBVARS([GL])
+  WITHLIB_OPTION([gl], [GL])
+
+  LIBCHECK_PROLOGUE([GL])
+
+  glut_cflags="none"
+  glut_ldflags="none"
+  case $host_os in
+    darwin*)
+      # Special case for OSX builds. Append these to give the user a chance to
+      # override with --with-gl*
+      glut_cflags="$glut_cflags|-framework GLUT -framework OpenGL"
+      glut_ldflags="$glut_ldflags|-framework GLUT -framework OpenGL"
+      ;;
+  esac
+
+  GLUT_SAVED_CPPFLAGS="$CPPFLAGS"
+  SAVED_IFS="$IFS"
+  IFS="|"
+  for flag in $glut_cflags; do
+    # restore IFS immediately as the autoconf macros may need the default.
+    IFS="$SAVED_IFS"
+    unset ac_cv_header_GL_glut_h
+    unset ac_cv_header_OpenGL_glut_h
+
+    case $flag in
+      none) ;;
+      *) CPPFLAGS="$flag $CPPFLAGS";;
+    esac
+    AC_CHECK_HEADERS([GL/glut.h GLUT/glut.h OpenGL/glut.h],
+                     [glut_headers=yes;
+                      test "$flag" = "none" || GL_INCLUDES="$CPPFLAGS";
+                      break])
+    CPPFLAGS="$GLUT_SAVED_CPPFLAGS"
+    test "$glut_headers" = "yes" && break
+  done
+  IFS="$SAVED_IFS"
+
+  if test "$glut_headers" = "yes"; then
+    AC_LANG_PUSH([C])
+    GLUT_SAVED_LDFLAGS="$LDFLAGS"
+    SAVED_IFS="$IFS"
+    IFS="|"
+    for flag in $glut_ldflags; do
+      # restore IFS immediately as the autoconf macros may need the default.
+      IFS="$SAVED_IFS"
+      unset ac_cv_search_glBegin
+
+      case $flag in
+        none) ;;
+        *) LDFLAGS="$flag $LDFLAGS";;
+      esac
+
+      # find libGL
+      GL_SAVED_LIBS="$LIBS"
+      AC_SEARCH_LIBS([glBegin], [GL OpenGL opengl32])
+      LIBS="$GL_SAVED_LIBS"
+
+      # A direct link to libGL may not be necessary on e.g., linux.
+      GLUT_SAVED_LIBS="$LIBS"
+      for lib in "" "-lglut" "-lglut $ac_cv_search_glBegin"; do
+        LIBS="$lib"
+        AC_LINK_IFELSE(
+          [AC_LANG_PROGRAM([
+             #ifdef __cplusplus
+             # define EXTERN_C extern "C"
+             #else
+             # define EXTERN_C
+             #endif
+             EXTERN_C char glOrtho();
+             EXTERN_C char glutMainLoop();
+            ],[
+             glOrtho();
+             glutMainLoop();
+            ])
+          ],
+          AC_DEFINE(WEBP_HAVE_GL, [1],
+                    [Set to 1 if OpenGL is supported])
+          [glut_support=yes], []
+        )
+        if test "$glut_support" = "yes"; then
+          GL_LIBS="$LDFLAGS $lib"
+          break
+        fi
+      done
+      LIBS="$GLUT_SAVED_LIBS"
+      LDFLAGS="$GLUT_SAVED_LDFLAGS"
+      test "$glut_support" = "yes" && break
+    done
+    IFS="$SAVED_IFS"
+    AC_LANG_POP
+  fi
+
+  LIBCHECK_EPILOGUE([GL])
+
+  if test "$glut_support" = "yes" -a "$enable_libwebpdemux" = "yes"; then
+    build_vwebp=yes
+  fi
+])
+AM_CONDITIONAL([BUILD_VWEBP], [test "$build_vwebp" = "yes"])
+
 dnl === check for PNG support ===

-PNG_INCLUDES=""
-PNG_LIBS=""
-AC_PATH_PROG(LIBPNG_CONFIG, libpng-config)
-if test -n "$LIBPNG_CONFIG"; then
-  PNG_INCLUDES=`$LIBPNG_CONFIG --cflags`
-  PNG_PREFIX=`$LIBPNG_CONFIG --prefix`
-  if test "${PNG_PREFIX}/lib" != "/usr/lib" ; then
-    PNG_LIBS="-L${PNG_PREFIX}/lib"
+AC_ARG_ENABLE([png], AS_HELP_STRING([--disable-png],
+                                    [Disable detection of PNG format support
+                                     @<:@default=auto@:>@]))
+AS_IF([test "x$enable_png" != "xno"], [
+  CLEAR_LIBVARS([PNG])
+  AC_PATH_PROGS([LIBPNG_CONFIG],
+                [libpng-config libpng16-config libpng15-config libpng14-config \
+                 libpng12-config])
+  if test -n "$LIBPNG_CONFIG"; then
+    PNG_INCLUDES=`$LIBPNG_CONFIG --cflags`
+    PNG_LIBS="`$LIBPNG_CONFIG --ldflags`"
  fi
-fi

-AC_ARG_WITH(pngincludedir,
-            [--with-pngincludedir=DIR use PNG includes from DIR],
-            PNG_INCLUDES="-I$withval")
-AC_ARG_WITH(pnglibdir,
-            [--with-pnglibdir=DIR    use PNG libraries from DIR],
-            [PNG_LIBS="-L$withval"])
+  WITHLIB_OPTION([png], [PNG])

-AC_CHECK_HEADER(png.h,
-  AC_CHECK_LIB(png, main,
-               [PNG_LIBS="$PNG_LIBS -lpng"
-                PNG_INCLUDES="$PNG_INCLUDES -DWEBP_HAVE_PNG"
-                AC_DEFINE(WEBP_HAVE_PNG, [1], [Set to 1 if PNG library is installed])
-               ],
-               AC_MSG_WARN(Optional png library not found),
-               [$MATH_LIBS]),
-  AC_MSG_WARN(png library not available - no png.h)
-)
-AC_SUBST(PNG_LIBS)
-AC_SUBST(PNG_INCLUDES)
+  LIBCHECK_PROLOGUE([PNG])
+  AC_CHECK_HEADER(png.h,
+    AC_SEARCH_LIBS(png_get_libpng_ver, [png],
+                   [test "$ac_cv_search_png_get_libpng_ver" = "none required" \
+                      || PNG_LIBS="$PNG_LIBS $ac_cv_search_png_get_libpng_ver"
+                    PNG_INCLUDES="$PNG_INCLUDES -DWEBP_HAVE_PNG"
+                    AC_DEFINE(WEBP_HAVE_PNG, [1],
+                              [Set to 1 if PNG library is installed])
+                    png_support=yes
+                   ],
+                   [AC_MSG_WARN(Optional png library not found)
+                    PNG_LIBS=""
+                    PNG_INCLUDES=""
+                   ],
+                   [$MATH_LIBS]),
+    [AC_MSG_WARN(png library not available - no png.h)
+     PNG_LIBS=""
+     PNG_INCLUDES=""
+    ],
+  )
+  LIBCHECK_EPILOGUE([PNG])
+])

 dnl === check for JPEG support ===

-JPEG_INCLUDES=""
-JPEG_LIBS=""
-AC_ARG_WITH(jpegincludedir,
-            [--with-jpegincludedir=DIR use JPEG includes from DIR],
-            JPEG_INCLUDES="-I$withval")
-AC_ARG_WITH(jpeglibdir,
-            [--with-jpeglibdir=DIR    use JPEG libraries from DIR],
-            [JPEG_LIBS="-L$withval"])
+AC_ARG_ENABLE([jpeg],
+              AS_HELP_STRING([--disable-jpeg],
+                             [Disable detection of JPEG format support
+                              @<:@default=auto@:>@]))
+AS_IF([test "x$enable_jpeg" != "xno"], [
+  CLEAR_LIBVARS([JPEG])
+  WITHLIB_OPTION([jpeg], [JPEG])

-AC_CHECK_HEADER(jpeglib.h,
-  AC_CHECK_LIB(jpeg, jpeg_set_defaults,
-               [JPEG_LIBS="$JPEG_LIBS -ljpeg"
-                JPEG_INCLUDES="$JPEG_INCLUDES -DWEBP_HAVE_JPEG"
-                AC_DEFINE(WEBP_HAVE_JPEG, [1], [Set to 1 if JPEG library is installed])
-               ],
-               AC_MSG_WARN(Optional jpeg library not found),
-               [$MATH_LIBS]),
-  AC_MSG_WARN(jpeg library not available - no jpeglib.h)
-)
-AC_SUBST(JPEG_LIBS)
-AC_SUBST(JPEG_INCLUDES)
+  LIBCHECK_PROLOGUE([JPEG])
+  AC_CHECK_HEADER(jpeglib.h,
+    AC_CHECK_LIB(jpeg, jpeg_set_defaults,
+                 [JPEG_LIBS="$JPEG_LIBS -ljpeg"
+                  JPEG_INCLUDES="$JPEG_INCLUDES -DWEBP_HAVE_JPEG"
+                  AC_DEFINE(WEBP_HAVE_JPEG, [1],
+                            [Set to 1 if JPEG library is installed])
+                  jpeg_support=yes
+                 ],
+                 AC_MSG_WARN(Optional jpeg library not found),
+                 [$MATH_LIBS]),
+    AC_MSG_WARN(jpeg library not available - no jpeglib.h)
+  )
+  LIBCHECK_EPILOGUE([JPEG])
+])
+
+dnl === check for TIFF support ===
+
+AC_ARG_ENABLE([tiff],
+              AS_HELP_STRING([--disable-tiff],
+                             [Disable detection of TIFF format support
+                              @<:@default=auto@:>@]))
+AS_IF([test "x$enable_tiff" != "xno"], [
+  CLEAR_LIBVARS([TIFF])
+  WITHLIB_OPTION([tiff], [TIFF])
+
+  LIBCHECK_PROLOGUE([TIFF])
+  AC_CHECK_HEADER(tiffio.h,
+    AC_CHECK_LIB(tiff, TIFFGetVersion,
+                 [TIFF_LIBS="$TIFF_LIBS -ltiff"
+                  TIFF_INCLUDES="$TIFF_INCLUDES -DWEBP_HAVE_TIFF"
+                  AC_DEFINE(WEBP_HAVE_TIFF, [1],
+                            [Set to 1 if TIFF library is installed])
+                  tiff_support=yes
+                 ],
+                 AC_MSG_WARN(Optional tiff library not found),
+                 [$MATH_LIBS]),
+    AC_MSG_WARN(tiff library not available - no tiffio.h)
+  )
+  LIBCHECK_EPILOGUE([TIFF])
+])
+
+dnl === check for GIF support ===
+
+AC_ARG_ENABLE([gif], AS_HELP_STRING([--disable-gif],
+                                    [Disable detection of GIF format support
+                                     @<:@default=auto@:>@]))
+AS_IF([test "x$enable_gif" != "xno"], [
+  CLEAR_LIBVARS([GIF])
+  WITHLIB_OPTION([gif], [GIF])
+
+  LIBCHECK_PROLOGUE([GIF])
+  AC_CHECK_HEADER(gif_lib.h,
+    AC_CHECK_LIB([gif], [DGifOpenFileHandle],
+                 [GIF_LIBS="$GIF_LIBS -lgif"
+                  AC_DEFINE(WEBP_HAVE_GIF, [1],
+                            [Set to 1 if GIF library is installed])
+                  gif_support=yes
+                 ],
+                 AC_MSG_WARN(Optional gif library not found),
+                 [$MATH_LIBS]),
+    AC_MSG_WARN(gif library not available - no gif_lib.h)
+  )
+  LIBCHECK_EPILOGUE([GIF])
+
+  if test "$gif_support" = "yes" -a \
+          "$enable_libwebpmux" = "yes"; then
+    build_gif2webp=yes
+  fi
+])
+AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])
+
+dnl === check for WIC support ===
+
+AC_ARG_ENABLE([wic],
+              AS_HELP_STRING([--disable-wic],
+                             [Disable Windows Imaging Component (WIC) detection.
+                              @<:@default=auto@:>@]),,
+              [enable_wic=yes])
+
+case $host_os in
+mingw*)
+if test "$enable_wic" = "yes"; then
+  AC_CHECK_HEADERS([wincodec.h shlwapi.h windows.h])
+  if test "$ac_cv_header_wincodec_h" = "yes"; then
+    AC_MSG_CHECKING(for Windows Imaging Component support)
+    SAVED_LIBS=$LIBS
+    LIBS="-lshlwapi -lole32 $LIBS"
+    # match include structure from [cd]webp.c
+    wic_headers="
+      #define INITGUID
+      #define CINTERFACE
+      #define COBJMACROS
+      #define _WIN32_IE 0x500
+
+      #include <shlwapi.h>
+      #include <windows.h>
+      #include <wincodec.h>
+      "
+    # test for functions from each lib and the GUID is created properly
+    wic_main="
+      int main(void) {
+        CLSID_WICImagingFactory;
+        CoInitialize(NULL);
+        SHCreateStreamOnFile(NULL, 0, NULL);
+        return 0;
+      }
+      "
+    AC_LANG_PUSH(C)
+    AC_LINK_IFELSE(
+      [AC_LANG_SOURCE([
+         $wic_headers
+         $wic_main])],
+      [wic_support=yes],
+      [wic_support=no]
+    )
+    AC_LANG_POP
+
+    test "$wic_support" = "yes" || LIBS=$SAVED_LIBS
+    AC_MSG_RESULT(${wic_support-no})
+  fi
+fi
+esac
+
+dnl === If --enable-aligned is defined, define WEBP_FORCE_ALIGNED
+
+AC_MSG_CHECKING(if --enable-aligned option is specified)
+AC_ARG_ENABLE([aligned],
+              AS_HELP_STRING([--enable-aligned],
+                             [Force aligned memory operations in non-dsp code
+                              (may be slower)]))
+if test "$enable_aligned" = "yes"; then
+  AC_DEFINE(WEBP_FORCE_ALIGNED, [1],
+            [Define to 1 to force aligned memory operations])
+fi
+AC_MSG_RESULT(${enable_aligned-no})
+
+dnl === If --enable-swap-16bit-csp is defined, add -DWEBP_SWAP_16BIT_CSP
+
+USE_SWAP_16BIT_CSP=""
+AC_MSG_CHECKING(if --enable-swap-16bit-csp option is specified)
+AC_ARG_ENABLE([swap-16bit-csp],
+              AS_HELP_STRING([--enable-swap-16bit-csp],
+                             [Enable byte swap for 16 bit colorspaces]))
+if test "$enable_swap_16bit_csp" = "yes"; then
+  USE_SWAP_16BIT_CSP="-DWEBP_SWAP_16BIT_CSP"
+fi
+AC_MSG_RESULT(${enable_swap_16bit_csp-no})
+AC_SUBST(USE_SWAP_16BIT_CSP)
+
+dnl === If --enable-experimental is defined, add -DWEBP_EXPERIMENTAL_FEATURES
+
+USE_EXPERIMENTAL_CODE=""
+AC_MSG_CHECKING(if --enable-experimental option is specified)
+AC_ARG_ENABLE([experimental], AS_HELP_STRING([--enable-experimental],
+                                             [Activate experimental features]))
+if test "$enable_experimental" = "yes"; then
+  AC_DEFINE(WEBP_EXPERIMENTAL_FEATURES, [1], [Enable experimental code])
+  USE_EXPERIMENTAL_CODE="-DWEBP_EXPERIMENTAL_FEATURES"
+fi
+AC_MSG_RESULT(${enable_experimental-no})
+AC_SUBST(USE_EXPERIMENTAL_CODE)
+
+dnl === Check whether libwebpmux should be built
+AC_MSG_CHECKING(whether libwebpmux is to be built)
+AC_ARG_ENABLE([libwebpmux],
+              AS_HELP_STRING([--enable-libwebpmux],
+                             [Build libwebpmux @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpmux-no})
+AM_CONDITIONAL([WANT_MUX], [test "$enable_libwebpmux" = "yes"])
+
+dnl === Check whether libwebpdemux should be built
+AC_MSG_CHECKING(whether libwebpdemux is to be built)
+AC_ARG_ENABLE([libwebpdemux],
+              AS_HELP_STRING([--enable-libwebpdemux],
+                             [Build libwebpdemux @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpdemux-no})
+AM_CONDITIONAL([WANT_DEMUX], [test "$enable_libwebpdemux" = "yes"])
+
+dnl === Check whether decoder library should be built.
+AC_MSG_CHECKING(whether decoder library is to be built)
+AC_ARG_ENABLE([libwebpdecoder],
+              AS_HELP_STRING([--enable-libwebpdecoder],
+                             [Build libwebpdecoder @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpdecoder-no})
+AM_CONDITIONAL([BUILD_LIBWEBPDECODER], [test "$enable_libwebpdecoder" = "yes"])
+
+dnl =========================

 AC_CONFIG_MACRO_DIR([m4])
-AC_CONFIG_HEADERS([config.h])
-AC_CONFIG_FILES([Makefile src/Makefile man/Makefile examples/Makefile src/dec/Makefile src/enc/Makefile src/libwebp.pc])
+AC_CONFIG_HEADERS([src/webp/config.h])
+AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
+                 examples/Makefile src/dec/Makefile \
+                 src/enc/Makefile src/dsp/Makefile \
+                 src/demux/Makefile src/mux/Makefile \
+                 src/utils/Makefile \
+                 src/libwebp.pc src/libwebpdecoder.pc \
+                 src/demux/libwebpdemux.pc src/mux/libwebpmux.pc])


 AC_OUTPUT
+
+AC_MSG_NOTICE([
+WebP Configuration Summary
+--------------------------
+
+Shared libraries: ${enable_shared}
+Static libraries: ${enable_static}
+Threading support: ${enable_threading-no}
+libwebp: yes
+libwebpdecoder: ${enable_libwebpdecoder-no}
+libwebpdemux: ${enable_libwebpdemux-no}
+libwebpmux: ${enable_libwebpmux-no}
+
+Tools:
+cwebp : yes
+  Input format support
+  ====================
+  JPEG : ${jpeg_support-no}
+  PNG  : ${png_support-no}
+  TIFF : ${tiff_support-no}
+  WIC  : ${wic_support-no}
+dwebp : yes
+  Output format support
+  =====================
+  PNG  : ${png_support-no}
+  WIC  : ${wic_support-no}
+GIF support : ${gif_support-no}
+gif2webp    : ${build_gif2webp-no}
+webpmux     : ${enable_libwebpmux-no}
+vwebp       : ${build_vwebp-no}
+])
--- a/doc/README
+++ b/doc/README
@ -0,0 +1,29 @@
+
+Generate libwebp Container Spec Docs from Text Source
+=====================================================
+
+HTML generation requires kramdown [1], easily installed as a
+rubygem [2].  Rubygems installation should satisfy dependencies
+automatically.
+
+[1]: http://kramdown.rubyforge.org/
+[2]: http://rubygems.org/
+
+HTML generation can then be done from the project root:
+
+$ kramdown doc/webp-container-spec.txt --template doc/template.html > \
+  doc/output/webp-container-spec.html
+
+kramdown can optionally syntax highlight code blocks, using CodeRay [3],
+a dependency of kramdown that rubygems will install automatically.  The
+following will apply inline CSS styling; an external stylesheet is not
+needed.
+
+$ kramdown doc/webp-lossless-bitstream-spec.txt --template \
+  doc/template.html --coderay-css style --coderay-line-numbers ' ' \
+  --coderay-default-lang c > \
+  doc/output/webp-lossless-bitstream-spec.html
+
+Optimally, use kramdown 0.13.7 or newer if syntax highlighting desired.
+
+[3]: http://coderay.rubychan.de/
--- a/doc/TODO
+++ b/doc/TODO
@ -0,0 +1,13 @@
+<louquillio@google.com>, 20111004
+
+* Determine that normative RFC 2119 terms (MUST, SHOULD, MAY, etc.) are
+  truly intended in all cases where capitalized.
+
+* Several passages could be made clearer.
+
+  * Overall edit for scope.  Portions are phrased as an introduction to
+    the 0.1.3 RIFF container additions, rather than a holistic guide to
+    WebP.
+
+  * To wit, suggest s/[spec|specification]/guide/g .  "Spec" can imply a
+    standards track; in any case it's too formal for a work in progress.
--- a/doc/template.html
+++ b/doc/template.html
@ -0,0 +1,94 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <title>WebP Container Specification</title>
+  <meta name="generator" content="kramdown <%= ::Kramdown::VERSION %>" />
+  <style type="text/css">
+  body {
+    color: #000;
+    background-color: #fff;
+    margin: 10%;
+    font-family: "Liberation Sans", "DejaVu Sans", "Bitstream Vera Sans", Arial, sans-serif;
+    line-height: 1.4;
+  }
+  h2 {
+    border-bottom: 1px solid #ccc;
+    padding-bottom: 0;
+  }
+  table {
+    border-collapse: collapse;
+  }
+  th, td {
+    border: 1px solid #999;
+    padding: .5em .7em;;
+  }
+  th {
+    color: #fff;
+    background-color: #000;
+  }
+  td {
+  }
+  hr {
+  }
+  code {
+    color: #000;
+    background-color: #f7f7f7;
+    padding: 0 3px;
+    font-family: "Liberation Mono", "DejaVu Sans Mono", "Bitstream Vera Sans Mono", Consolata, monospace;
+  }
+  pre {
+    background-color: #f7f7f7;
+    padding: 1em;
+    border: 1px solid #ccc;
+    width: 42em;
+    overflow: auto;
+    font-family: "Liberation Mono", "DejaVu Sans Mono", "Bitstream Vera Sans Mono", Consolata, monospace;
+  }
+  pre code {
+    background-color: #f7f7f7;
+    padding: 0; /* Only want padding on inline code, not blocks */
+  }
+  pre.terminal {
+    color: #fff;
+    background-color: #000;
+    border: 1px solid #ccc;
+    max-height: 30em;
+  }
+  pre.terminal code {
+    color: #fff;
+    background-color: #000;
+    font-size: smaller;
+  }
+  #markdown-toc ul {
+    list-style-type: disc;
+  }
+  ul#markdown-toc {
+    margin-top: -1em;
+    visibility: hidden;
+    -webkit-padding-start: 0;
+  }
+  ul#markdown-toc ul {
+    visibility: visible;
+  }
+  ul#markdown-toc ul ul{
+    visibility: visible;
+  }
+  ul#markdown-toc + hr {
+    margin-bottom: 4em;
+  }
+  ol ol { /* Format nested ordered lists */
+    list-style-type: lower-alpha;
+  }
+  dt {
+    font-style: italic;
+    font-weight: bold;
+  }
+  .caption {
+  }
+  </style>
+</head>
+<body>
+<%= @body %>
+</body>
+</html>
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -0,0 +1,909 @@
+<!--
+
+Although you may be viewing an alternate representation, this document
+is sourced in Markdown, a light-duty markup scheme, and is optimized for
+the [kramdown](http://kramdown.rubyforge.org/) transformer.
+
+See the accompanying README. External link targets are referenced at the
+end of this file.
+
+-->
+
+
+WebP Container Specification
+============================
+
+* TOC placeholder
+{:toc}
+
+
+Introduction
+------------
+
+WebP is an image format that uses either (i) the VP8 key frame encoding
+to compress image data in a lossy way, or (ii) the WebP lossless encoding
+(and possibly other encodings in the future). These encoding schemes should
+make it more efficient than currently used formats. It is optimized for fast
+image transfer over the network (e.g., for websites). The WebP format has
+feature parity (color profile, metadata, animation etc) with other formats as
+well. This document describes the structure of a WebP file.
+
+The WebP container (i.e., RIFF container for WebP) allows feature support over
+and above the basic use case of WebP (i.e., a file containing a single image
+encoded as a VP8 key frame). The WebP container provides additional support
+for:
+
+  * **Lossless compression.** An image can be losslessly compressed, using the
+    WebP Lossless Format.
+
+  * **Metadata.** An image may have metadata stored in EXIF or XMP formats.
+
+  * **Transparency.** An image may have transparency, i.e., an alpha channel.
+
+  * **Color Profile.** An image may have an embedded ICC profile as described
+    by the [International Color Consortium][iccspec].
+
+  * **Animation.** An image may have multiple frames with pauses between them,
+    making it an animation.
+
+  * **Image Fragmentation.** A single bitstream in WebP has an inherent
+    limitation for width or height of 2^14 pixels, and, when using VP8, a 512
+    KiB limit on the size of the first compressed partition. To support larger
+    images, the format supports images that are composed of multiple fragments,
+    each encoded as a separate bitstream. All fragments logically form a single
+    image: they have common metadata, color profile, etc. Image fragmentation
+    may also improve efficiency for larger images, e.g., grass can be encoded
+    differently than sky.
+
+The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
+"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
+document are to be interpreted as described in [RFC 2119][].
+
+**Note:** Out of the features mentioned above, lossy compression, lossless
+compression, transparency, metadata, color profile and animation are finalized
+and are to be considered stable. On the other hand, image fragmentation is
+experimental as of now, and is open to discussion, feedback and comments.
+The same is indicated using annotation "_status: experimental_" in the relevant
+sections of this document.
+
+Terminology &amp; Basics
+------------------------
+
+A WebP file contains either a still image (i.e., an encoded matrix of pixels)
+or an [animation](#animation). Optionally, it can also contain transparency
+information, color profile and metadata. In case we need to refer only to the
+matrix of pixels, we will call it the _canvas_ of the image.
+
+Below are additional terms used throughout this document:
+
+_Reader/Writer_
+
+: Code that reads WebP files is referred to as a _reader_, while code that
+writes them is referred to as a _writer_.
+
+_uint16_
+
+: A 16-bit, little-endian, unsigned integer.
+
+_uint24_
+
+: A 24-bit, little-endian, unsigned integer.
+
+_uint32_
+
+: A 32-bit, little-endian, unsigned integer.
+
+_FourCC_
+
+: A _FourCC_ (four-character code) is a _uint32_ created by concatenating four
+  ASCII characters in little-endian order.
+
+_1-based_
+
+: An unsigned integer field storing values offset by `-1`. e.g., Such a field
+would store value _25_ as _24_.
+
+RIFF file format
+----------------
+The WebP file format is based on the RIFF (resource interchange file format)
+document format.
+
+The basic element of a RIFF file is a _chunk_. It consists of:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                         Chunk FourCC                          |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                          Chunk Size                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                         Chunk Payload                         |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Chunk FourCC: 32 bits
+
+: ASCII four-character code used for chunk identification.
+
+Chunk Size: 32 bits (_uint32_)
+
+: The size of the chunk not including this field, the chunk identifier or
+  padding.
+
+Chunk Payload: _Chunk Size_ bytes
+
+: The data payload. If _Chunk Size_ is odd, a single padding byte -- that
+  SHOULD be `0` -- is added.
+
+_ChunkHeader('ABCD')_
+
+: This is used to describe the _FourCC_ and _Chunk Size_ header of individual
+  chunks, where 'ABCD' is the FourCC for the chunk. This element's
+  size is 8 bytes.
+
+**Note:** RIFF has a convention that all-uppercase chunk FourCCs are standard
+chunks that apply to any RIFF file format, while FourCCs specific to a file
+format are all lowercase. WebP does not follow this convention.
+
+WebP file header
+----------------
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |      'R'      |      'I'      |      'F'      |      'F'      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                           File Size                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |      'W'      |      'E'      |      'B'      |      'P'      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+'RIFF': 32 bits
+
+: The ASCII characters 'R' 'I' 'F' 'F'.
+
+File Size: 32 bits (_uint32_)
+
+: The size of the file in bytes starting at offset 8. The maximum value of
+this field is 2^32 minus 10 bytes and thus the size of the whole file is at
+most 4GiB minus 2 bytes.
+
+'WEBP': 32 bits
+
+: The ASCII characters 'W' 'E' 'B' 'P'.
+
+A WebP file MUST begin with a RIFF header with the FourCC 'WEBP'. The file size
+in the header is the total size of the chunks that follow plus `4` bytes for
+the 'WEBP' FourCC. The file SHOULD NOT contain anything after it. As the size
+of any chunk is even, the size given by the RIFF header is also even. The
+contents of individual chunks will be described in the following sections.
+
+Simple file format (lossy)
+--------------------------
+
+This layout SHOULD be used if the image requires _lossy_ encoding and does not
+require transparency or other advanced features provided by the extended format.
+Files with this layout are smaller and supported by older software.
+
+Simple WebP (lossy) file format:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                    WebP file header (12 bytes)                |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                          VP8 chunk                            |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+VP8 chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('VP8 ')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                           VP8 data                            |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+VP8 data: _Chunk Size_ bytes
+
+: VP8 bitstream data.
+
+The VP8 bitstream format specification can be found at [VP8 Data Format and
+Decoding Guide][vp8spec]. Note that the VP8 frame header contains the VP8 frame
+width and height. That is assumed to be the width and height of the canvas.
+
+The VP8 specification describes how to decode the image into Y'CbCr
+format. To convert to RGB, Rec. 601 SHOULD be used.
+
+Simple file format (lossless)
+-----------------------------
+
+**Note:** Older readers may not support files using the lossless format.
+
+This layout SHOULD be used if the image requires _lossless_ encoding (with an
+optional transparency channel) and does not require advanced features provided
+by the extended format.
+
+Simple WebP (lossless) file format:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                    WebP file header (12 bytes)                |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                          VP8L chunk                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+VP8L chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('VP8L')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                           VP8L data                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+VP8L data: _Chunk Size_ bytes
+
+: VP8L bitstream data.
+
+The current specification of the VP8L bitstream can be found at
+[WebP Lossless Bitstream Format][webpllspec]. Note that the VP8L header
+contains the VP8L image width and height. That is assumed to be the width
+and height of the canvas.
+
+Extended file format
+--------------------
+
+**Note:** Older readers may not support files using the extended format.
+
+An extended format file consists of:
+
+  * A 'VP8X' chunk with information about features used in the file.
+
+  * An optional 'ICCP' chunk with color profile.
+
+  * An optional 'ANIM' chunk with animation control data.
+
+  * Image data.
+
+  * An optional 'EXIF' chunk with EXIF metadata.
+
+  * An optional 'XMP ' chunk with XMP metadata.
+
+  * An optional list of [unknown chunks](#unknown-chunks). _\[status: experimental\]_
+
+For a _still image_, the _image data_ consists of a single frame, whereas for
+an _animated image_, it consists of multiple frames. More details about frames
+can be found in the [Animation](#animation) section.
+
+Moreover, each frame can be fragmented or non-fragmented, as will be described
+in the [Extended WebP file header](#extended_header) section. More details about
+fragments can be found in the [Fragments](#fragments) section.
+
+All chunks SHOULD be placed in the same order as listed above. If a chunk
+appears in the wrong place, the file is invalid, but readers MAY parse the
+file, ignoring the chunks that come too late.
+
+**Rationale:** Setting the order of chunks should allow quicker file
+parsing. For example, if an 'ALPH' chunk does not appear in its required
+position, a decoder can choose to stop searching for it. The rule of
+ignoring late chunks should make programs that need to do a full search
+give the same results as the ones stopping early.
+
+Extended WebP file header:
+{:#extended_header}
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                   WebP file header (12 bytes)                 |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('VP8X')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |Rsv|I|L|E|X|A|F|                   Reserved                    |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |          Canvas Width Minus One               |             ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...  Canvas Height Minus One    |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Reserved (Rsv): 2 bits
+
+: SHOULD be `0`.
+
+ICC profile (I): 1 bit
+
+: Set if the file contains an ICC profile.
+
+Alpha (L): 1 bit
+
+: Set if any of the frames of the image contain transparency information
+("alpha").
+
+EXIF metadata (E): 1 bit
+
+: Set if the file contains EXIF metadata.
+
+XMP metadata (X): 1 bit
+
+: Set if the file contains XMP metadata.
+
+Animation (A): 1 bit
+
+: Set if this is an animated image. Data in 'ANIM' and 'ANMF' chunks should be
+used to control the animation.
+
+Image Fragmentation (F): 1 bit _\[status: experimental\]_
+
+: Set if any of the frames in the image are represented by fragments.
+
+Reserved: 24 bits
+
+: SHOULD be `0`.
+
+Canvas Width Minus One: 24 bits
+
+: _1-based_ width of the canvas in pixels.
+  The actual canvas width is '1 + Canvas Width Minus One'
+
+Canvas Height Minus One: 24 bits
+
+: _1-based_ height of the canvas in pixels.
+  The actual canvas height is '1 + Canvas Height Minus One'
+
+The product of _Canvas Width_ and _Canvas Height_ MUST be at most `2^32 - 1`.
+
+Future specifications MAY add more fields.
+
+### Chunks
+
+#### Animation
+
+An animation is controlled by ANIM and ANMF chunks.
+
+ANIM Chunk:
+{:#anim_chunk}
+
+For an animated image, this chunk contains the _global parameters_ of the
+animation.
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ANIM')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                       Background Color                        |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |          Loop Count           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Background Color: 32 bits (_uint32_)
+
+: The default background color of the canvas in \[Blue, Green, Red, Alpha\]
+byte order. This color MAY be used to fill the unused space on the canvas around
+the frames, as well as the transparent pixels of the first frame. Background
+color is also used when disposal method is `1`.
+
+**Note**:
+
+  * Background color MAY contain a transparency value (alpha), even if the
+    _Alpha_ flag in [VP8X chunk](#extended_header) is unset.
+
+  * Viewer applications SHOULD treat the background color value as a hint, and
+    are not required to use it.
+
+Loop Count: 16 bits (_uint16_)
+
+: The number of times to loop the animation. `0` means infinitely.
+
+This chunk MUST appear if the _Animation_ flag in the VP8X chunk is set.
+If the _Animation_ flag is not set and this chunk is present, it
+SHOULD be ignored.
+
+
+ANMF chunk:
+
+For animated images, this chunk contains information about a _single_ frame.
+If the _Animation flag_ is not set, then this chunk SHOULD NOT be present.
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ANMF')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                        Frame X                |             ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...          Frame Y            |   Frame Width Minus One     ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...             |           Frame Height Minus One              |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                 Frame Duration                |  Reserved |B|D|
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                         Frame Data                            |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Frame X: 24 bits (_uint24_)
+
+: The X coordinate of the upper left corner of the frame is `Frame X * 2`
+
+Frame Y: 24 bits (_uint24_)
+
+: The Y coordinate of the upper left corner of the frame is `Frame Y * 2`
+
+Frame Width Minus One: 24 bits (_uint24_)
+
+: The _1-based_ width of the frame.
+  The frame width is `1 + Frame Width Minus One`
+
+Frame Height Minus One: 24 bits (_uint24_)
+
+: The _1-based_ height of the frame.
+  The frame height is `1 + Frame Height Minus One`
+
+Frame Duration: 24 bits (_uint24_)
+
+: The time to wait before displaying the next frame, in 1 millisecond units.
+In particular, frame duration of 0 is useful when one wants to update multiple
+areas of the canvas at once during the animation.
+
+Reserved: 6 bits
+
+: SHOULD be 0.
+
+Blending method (B): 1 bit
+
+: Indicates how transparent pixels of _the current frame_ are to be blended with
+corresponding pixels of the previous canvas:
+
+  * `0`: Use alpha blending. After disposing of the previous frame, render the
+    current frame on the canvas using [alpha-blending](#alpha-blending). If the
+    current frame does not have an alpha channel, assume alpha value of 255,
+    effectively replacing the rectangle.
+
+  * `1`: Do not blend. After disposing of the previous frame, render the
+    current frame on the canvas by overwriting the rectangle covered by the
+    current frame.
+
+Disposal method (D): 1 bit
+
+: Indicates how _the current frame_ is to be treated after it has been displayed
+(before rendering the next frame) on the canvas:
+
+  * `0`: Do not dispose. Leave the canvas as is.
+
+  * `1`: Dispose to background color. Fill the _rectangle_ on the canvas covered
+    by the _current frame_ with background color specified in the
+    [ANIM chunk](#anim_chunk).
+
+**Notes**:
+
+  * The frame disposal only applies to the _frame rectangle_, that is, the
+    rectangle defined by _Frame X_, _Frame Y_, _frame width_ and _frame height_.
+    It may or may not cover the whole canvas.
+
+{:#alpha-blending}
+  * **Alpha-blending**:
+
+    Given that each of the R, G, B and A channels is 8-bit, and the RGB
+    channels are _not premultiplied_ by alpha, the formula for blending
+    'dst' onto 'src' is:
+
+~~~~~
+    blend.A = src.A + dst.A * (1 - src.A / 255)
+    if blend.A = 0 then
+      blend.RGB = 0
+    else
+      blend.RGB = (src.RGB * src.A +
+                   dst.RGB * dst.A * (1 - src.A / 255)) / blend.A
+~~~~~
+
+  * Alpha-blending SHOULD be done in linear color space, by taking into account
+    the [color profile](#color-profile) of the image. If the color profile is
+    not present, sRGB is to be assumed. (Note that sRGB also needs to be
+    linearized due to a gamma of ~2.2).
+
+Frame Data: _Chunk Size_ - `16` bytes
+
+: For a fragmented frame, it consists of multiple [fragment chunks](#fragments).
+
+: For a non-fragmented frame, it consists of:
+
+  * An optional [alpha subchunk](#alpha) for the frame.
+
+  * A [bitstream subchunk](#bitstream-vp8vp8l) for the frame.
+
+  * An optional list of [unknown chunks](#unknown-chunks).
+
+**Note**: The 'ANMF' payload, _Frame Data_ above, consists of individual
+_padded_ chunks as described by the [RIFF file format](#riff-file-format).
+
+#### Fragments _\[status: experimental\]_
+
+For images that are represented by fragments, this chunk contains data for
+a single fragment. If the _Image Fragmentation Flag_ is not set, then this chunk
+SHOULD NOT be present.
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('FRGM')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                  Fragment X                   |             ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...       Fragment Y            |         Fragment Data         |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Fragment X: 24 bits (_uint24_)
+
+: The X coordinate of the upper left corner of the fragment is `Fragment X * 2`
+
+Fragment Y: 24 bits (_uint24_)
+
+: The Y coordinate of the upper left corner of the fragment is `Fragment Y * 2`
+
+Fragment Data: _Chunk Size_ - `6` bytes
+
+: It contains:
+
+  * An optional [alpha subchunk](#alpha) for the fragment.
+  * The [bitstream subchunk](#bitstream-vp8vp8l) for the fragment.
+  * An optional list of [unknown chunks](#unknown-chunks).
+
+Note: The width and height of the fragment is obtained from the bitstream
+subchunk.
+
+The fragments of a frame SHOULD have the following properties:
+
+  * They collectively cover the whole frame.
+
+  * No pair of fragments have any overlapping region on the frame.
+
+  * No portion of any fragment should be located outside of the canvas.
+
+#### Alpha
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ALPH')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |Rsv| P | F | C |     Alpha Bitstream...                        |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Reserved (Rsv): 2 bits
+
+: SHOULD be `0`.
+
+Pre-processing (P): 2 bits
+
+: These INFORMATIVE bits are used to signal the pre-processing that has
+been performed during compression. The decoder can use this information to
+e.g. dither the values or smooth the gradients prior to display.
+
+  * `0`: no pre-processing
+  * `1`: level reduction
+
+Filtering method (F): 2 bits
+
+: The filtering method used:
+
+  * `0`: None.
+  * `1`: Horizontal filter.
+  * `2`: Vertical filter.
+  * `3`: Gradient filter.
+
+For each pixel, filtering is performed using the following calculations.
+Assume the alpha values surrounding the current `X` position are labeled as:
+
+     C | B |
+    ---+---+
+     A | X |
+
+We seek to compute the alpha value at position `X`. First, a prediction is
+made depending on the filtering method:
+
+  * Method `0`: predictor = 0
+  * Method `1`: predictor = A
+  * Method `2`: predictor = B
+  * Method `3`: predictor = clip(A + B - C)
+
+where `clip(v)` is equal to:
+
+  * 0    if v < 0
+  * 255  if v > 255
+  * v    otherwise
+
+The final value is derived by adding the decompressed value `X` to the
+predictor and using modulo-256 arithmetic to wrap the \[256-511\] range
+into the \[0-255\] one:
+
+`alpha = (predictor + X) % 256`
+
+There are special cases for left-most and top-most pixel positions:
+
+  * Top-left value at location (0,0) uses 0 as predictor value. Otherwise,
+  * For horizontal or gradient filtering methods, the left-most pixels at
+    location (0, y) are predicted using the location (0, y-1) just above.
+  * For vertical or gradient filtering methods, the top-most pixels at
+    location (x, 0) are predicted using the location (x-1, 0) on the left.
+
+
+Decoders are not required to use this information in any specified way.
+
+Compression method (C): 2 bits
+
+: The compression method used:
+
+  * `0`: No compression.
+  * `1`: Compressed using the WebP lossless format.
+
+Alpha bitstream: _Chunk Size_ - `1` bytes
+
+: Encoded alpha bitstream.
+
+This optional chunk contains encoded alpha data for this frame/fragment. A
+frame/fragment containing a 'VP8L' chunk SHOULD NOT contain this chunk.
+
+**Rationale**: The transparency information is already part of the 'VP8L'
+chunk.
+
+The alpha channel data is stored as uncompressed raw data (when
+compression method is '0') or compressed using the lossless format
+(when the compression method is '1').
+
+  * Raw data: consists of a byte sequence of length width * height,
+    containing all the 8-bit transparency values in scan order.
+
+  * Lossless format compression: the byte sequence is a compressed
+    image-stream (as described in the [WebP Lossless Bitstream Format]
+    [webpllspec]) of implicit dimension width x height. That is, this
+    image-stream does NOT contain any headers describing the image dimension.
+
+    **Rationale**: the dimension is already known from other sources,
+    so storing it again would be redundant and error-prone.
+
+    Once the image-stream is decoded into ARGB color values, following
+    the process described in the lossless format specification, the
+    transparency information must be extracted from the *green* channel
+    of the ARGB quadruplet.
+
+    **Rationale**: the green channel is allowed extra transformation
+    steps in the specification -- unlike the other channels -- that can
+    improve compression.
+
+#### Bitstream (VP8/VP8L)
+
+This chunk contains compressed bitstream data for a single frame/fragment.
+
+A bitstream chunk may be either (i) a VP8 chunk, using "VP8 " (note the
+significant fourth-character space) as its tag _or_ (ii) a VP8L chunk, using
+"VP8L" as its tag.
+
+The formats of VP8 and VP8L chunks are as described in sections
+[Simple file format (lossy)](#simple-file-format-lossy)
+and [Simple file format (lossless)](#simple-file-format-lossless) respectively.
+
+#### Color profile
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ICCP')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                       Color Profile                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Color Profile: _Chunk Size_ bytes
+
+: ICC profile.
+
+This chunk MUST appear before the image data.
+
+There SHOULD be at most one such chunk. If there are more such chunks, readers
+MAY ignore all except the first one.
+See the [ICC Specification][iccspec] for details.
+
+If this chunk is not present, sRGB SHOULD be assumed.
+
+#### Metadata
+
+Metadata can be stored in 'EXIF' or 'XMP ' chunks.
+
+There SHOULD be at most one chunk of each type ('EXIF' and 'XMP '). If there
+are more such chunks, readers MAY ignore all except the first one. Also, a file
+may possibly contain both 'EXIF' and 'XMP ' chunks.
+
+The chunks are defined as follows:
+
+EXIF chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('EXIF')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                        EXIF Metadata                          |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+EXIF Metadata: _Chunk Size_ bytes
+
+: image metadata in EXIF format.
+
+
+XMP chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('XMP ')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                        XMP Metadata                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+XMP Metadata: _Chunk Size_ bytes
+
+: image metadata in XMP format.
+
+Additional guidance about handling metadata can be found in the
+Metadata Working Group's [Guidelines for Handling Metadata][metadata].
+
+#### Unknown Chunks _\[status: experimental\]_
+
+A RIFF chunk (described in [this](#terminology-amp-basics) section) whose _chunk
+tag_ is different from any of the chunks described in this document, is
+considered an _unknown chunk_.
+
+**Rationale**: Allowing unknown chunks gives a provision for future extension
+of the format, and also allows storage of any application-specific data.
+
+A file MAY contain unknown chunks:
+
+  * At the end of the file as described in [Extended WebP file
+    header](#extended_header) section.
+  * At the end of FRGM and ANMF chunks as described in [Fragments](#fragments)
+    and [Animation](#animation) sections.
+
+Readers SHOULD ignore these chunks. Writers SHOULD preserve them in their
+original order (unless they specifically intend to modify these chunks).
+
+### Assembling the Canvas from fragments/frames
+
+Here we provide an overview of how a reader should assemble a canvas in case
+of a fragmented-image and in case of an animated image. The notation
+_VP8X.field_ means the field in the 'VP8X' chunk with the same description.
+
+Displaying a _fragmented image_ canvas MUST be equivalent to the following
+pseudocode: _\[status: experimental\]_
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+assert VP8X.flags.hasFragments
+canvas ← new black image of size VP8X.canvasWidth x VP8X.canvasHeight.
+frgm_params ← nil
+for chunk in image_data:
+    assert chunk.tag is "FRGM"
+    frgm_params.fragmentX = Fragment X
+    frgm_params.fragmentY = Fragment Y
+    for subchunk in 'Fragment Data':
+        if subchunk.tag == "ALPH":
+            assert alpha subchunks not found in 'Fragment Data' earlier
+            frgm_params.alpha = alpha_data
+        else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
+            assert bitstream subchunks not found in 'Fragment Data' earlier
+            frgm_params.bitstream = bitstream_data
+    frgm_params.fragmentWidth = Width extracted from bitstream subchunk
+    frgm_params.fragmentHeight = Height extracted from bitstream subchunk
+    assert VP8X.canvasWidth >=
+        frgm_params.fragmentX + frgm_params.fragmentWidth
+    assert VP8X.canvasHeight >=
+        frgm_params.fragmentY + frgm_params.fragmentHeight
+    assert fragment has the properties mentioned in "Image Fragments" section.
+    render fragment with frame_params.alpha and frame_params.bitstream on canvas
+    with top-left corner in (frgm_params.fragmentX, frgm_params.fragmentY).
+canvas contains the decoded canvas.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Displaying an _animated image_ canvas MUST be equivalent to the following
+pseudocode:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+assert VP8X.flags.hasAnimation
+canvas ← new image of size VP8X.canvasWidth x VP8X.canvasHeight with
+background color ANIM.background_color.
+loop_count ← ANIM.loopCount
+dispose_method ← ANIM.disposeMethod
+if loop_count == 0:
+    loop_count = ∞
+frame_params ← nil
+for loop = 0, ..., loop_count - 1
+    assert next chunk in image_data is ANMF
+    frame_params.frameX = Frame X
+    frame_params.frameY = Frame Y
+    frame_params.frameWidth = Frame Width Minus One + 1
+    frame_params.frameHeight = Frame Height Minus One + 1
+    frame_params.frameDuration = Frame Duration
+    assert VP8X.canvasWidth >= frame_params.frameX + frame_params.frameWidth
+    assert VP8X.canvasHeight >= frame_params.frameY + frame_params.frameHeight
+    if VP8X.flags.hasFragments and first subchunk in 'Frame Data' is FRGM
+        // Fragmented frame.
+        frame_params.{bitstream,alpha} = canvas decoded from subchunks in
+                                         'Frame Data' as per the pseudocode for
+                                         _fragmented image_ above.
+    else
+        // Non-fragmented frame.
+        for subchunk in 'Frame Data':
+            if subchunk.tag == "ALPH":
+                assert alpha subchunks not found in 'Frame Data' earlier
+                frame_params.alpha = alpha_data
+            else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
+                assert bitstream subchunks not found in 'Frame Data' earlier
+                frame_params.bitstream = bitstream_data
+    render frame with frame_params.alpha and frame_params.bitstream on canvas
+    with top-left corner in (frame_params.frameX, frame_params.frameY), using
+    dispose method dispose_method.
+    Show the contents of the image for frame_params.frameDuration * 1ms.
+canvas contains the decoded canvas.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Example file layouts
+--------------------
+
+A lossy encoded image with alpha may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- ALPH (alpha bitstream)
+- VP8 (bitstream)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A losslessly encoded image may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- XYZW (unknown chunk)
+- VP8L (lossless bitstream)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A lossless image with ICC profile and XMP metadata may
+look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- ICCP (color profile)
+- VP8L (lossless bitstream)
+- XMP  (metadata)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A fragmented image may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- FRGM (fragment1 parameters + data)
+- FRGM (fragment2 parameters + data)
+- FRGM (fragment3 parameters + data)
+- FRGM (fragment4 parameters + data)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An animated image with EXIF metadata may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- ANIM (global animation parameters)
+- ANMF (frame1 parameters + data)
+- ANMF (frame2 parameters + data)
+- ANMF (frame3 parameters + data)
+- ANMF (frame4 parameters + data)
+- EXIF (metadata)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+[vp8spec]:  http://tools.ietf.org/html/rfc6386
+[webpllspec]: https://gerrit.chromium.org/gerrit/gitweb?p=webm/libwebp.git;a=blob;f=doc/webp-lossless-bitstream-spec.txt;hb=master
+[iccspec]: http://www.color.org/icc_specs2.xalter
+[metadata]: http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
+[rfc 2119]: http://tools.ietf.org/html/rfc2119
--- a/doc/webp-lossless-bitstream-spec.txt
+++ b/doc/webp-lossless-bitstream-spec.txt
--- a/examples/Android.mk
+++ b/examples/Android.mk
@ -0,0 +1,46 @@
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    example_util.c \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
+
+LOCAL_MODULE := example_util
+
+include $(BUILD_STATIC_LIBRARY)
+
+include $(CLEAR_VARS)
+
+# Note: to enable jpeg/png encoding the sources from AOSP can be used with
+# minor modification to their Android.mk files.
+LOCAL_SRC_FILES := \
+    cwebp.c \
+    jpegdec.c \
+    metadata.c \
+    pngdec.c \
+    tiffdec.c \
+    webpdec.c \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
+LOCAL_STATIC_LIBRARIES := example_util webp
+
+LOCAL_MODULE := cwebp
+
+include $(BUILD_EXECUTABLE)
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    dwebp.c \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
+LOCAL_STATIC_LIBRARIES := example_util webp
+
+LOCAL_MODULE := dwebp
+
+include $(BUILD_EXECUTABLE)
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@ -1,11 +1,54 @@
-AM_CPPFLAGS = -I$(top_srcdir)/src
+AM_CPPFLAGS = -I$(top_builddir)/src -I$(top_srcdir)/src

 bin_PROGRAMS = dwebp cwebp
+if BUILD_VWEBP
+  bin_PROGRAMS += vwebp
+endif
+if WANT_MUX
+  bin_PROGRAMS += webpmux
+endif
+
+if BUILD_GIF2WEBP
+  bin_PROGRAMS += gif2webp
+endif
+
+noinst_LTLIBRARIES = libexampleutil.la
+
+libexampleutil_la_SOURCES = example_util.c example_util.h stopwatch.h

 dwebp_SOURCES = dwebp.c stopwatch.h
-dwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES)
-dwebp_LDADD = ../src/libwebp.la $(PNG_LIBS) $(JPEG_LIBS)
+dwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+dwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES)
+dwebp_LDADD = libexampleutil.la $(PNG_LIBS) $(JPEG_LIBS)

-cwebp_SOURCES = cwebp.c stopwatch.h
-cwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES)
-cwebp_LDADD = ../src/libwebp.la $(PNG_LIBS) $(JPEG_LIBS)
+cwebp_SOURCES  = cwebp.c metadata.c metadata.h stopwatch.h
+cwebp_SOURCES += jpegdec.c jpegdec.h
+cwebp_SOURCES += pngdec.c pngdec.h
+cwebp_SOURCES += tiffdec.c tiffdec.h
+cwebp_SOURCES += webpdec.c webpdec.h
+cwebp_SOURCES += wicdec.c wicdec.h
+cwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+cwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
+cwebp_LDADD  = libexampleutil.la ../src/libwebp.la
+cwebp_LDADD += $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)
+
+gif2webp_SOURCES = gif2webp.c gif2webp_util.c
+gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
+gif2webp_LDADD  = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
+gif2webp_LDADD += $(GIF_LIBS)
+
+webpmux_SOURCES = webpmux.c
+webpmux_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+webpmux_LDADD = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
+
+vwebp_SOURCES = vwebp.c
+vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GL_INCLUDES)
+vwebp_LDADD = libexampleutil.la ../src/demux/libwebpdemux.la $(GL_LIBS)
+
+if BUILD_LIBWEBPDECODER
+  dwebp_LDADD += ../src/libwebpdecoder.la
+  vwebp_LDADD += ../src/libwebpdecoder.la
+else
+  dwebp_LDADD += ../src/libwebp.la
+  vwebp_LDADD += ../src/libwebp.la
+endif
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -1,14 +1,13 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-//  simple command-line example calling libwebpdecode to
-//  decode a WebP image into a PPM image.
-//
-//  Compile with:     gcc -o dwebp dwebp.c -lwebpdecode
+//  Command-line tool for decoding a WebP image.
 //
 // Author: Skal (pascal.massimino@gmail.com)

@ -17,41 +16,67 @@
 #include <stdlib.h>
 #include <string.h>

+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
 #ifdef WEBP_HAVE_PNG
 #include <png.h>
 #endif

-#ifdef _WIN32
+#ifdef HAVE_WINCODEC_H
+#ifdef __MINGW32__
+#define INITGUID  // Without this GUIDs are declared extern and fail to link
+#endif
 #define CINTERFACE
 #define COBJMACROS
 #define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
                         // code with COBJMACROS.
+#include <ole2.h>  // CreateStreamOnHGlobal()
 #include <shlwapi.h>
 #include <windows.h>
 #include <wincodec.h>
 #endif

 #include "webp/decode.h"
-#include "stopwatch.h"
+#include "./example_util.h"
+#include "./stopwatch.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+static int verbose = 0;
+#ifndef WEBP_DLL
+#ifdef __cplusplus
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+extern void* VP8GetCPUInfo;   // opaque forward declaration.

-static int verbose = 0;
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+#endif  // WEBP_DLL

-#ifdef _WIN32
+//------------------------------------------------------------------------------

-#define IFS(fn)                \
-  do {                         \
-     if (SUCCEEDED(hr))        \
-     {                         \
-        hr = (fn);             \
-        if (FAILED(hr) && verbose)           \
-          printf(#fn " failed %08x\n", hr);  \
-     }                         \
+// Output types
+typedef enum {
+  PNG = 0,
+  PAM,
+  PPM,
+  PGM,
+  BMP,
+  TIFF,
+  YUV,
+  ALPHA_PLANE_ONLY  // this is for experimenting only
+} OutputFileFormat;
+
+#ifdef HAVE_WINCODEC_H
+
+#define IFS(fn)                                                     \
+  do {                                                              \
+    if (SUCCEEDED(hr)) {                                            \
+      hr = (fn);                                                    \
+      if (FAILED(hr)) fprintf(stderr, #fn " failed %08lx\n", hr);   \
+    }                                                               \
  } while (0)

 #ifdef __cplusplus
@ -60,75 +85,119 @@ static int verbose = 0;
 #define MAKE_REFGUID(x) &(x)
 #endif

-static HRESULT CreateOutputStream(const char* out_file_name, IStream** ppStream) {
+static HRESULT CreateOutputStream(const char* out_file_name,
+                                  int write_to_mem, IStream** stream) {
  HRESULT hr = S_OK;
-  IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, ppStream));
-  if (FAILED(hr))
-    printf("Error opening output file %s (%08x)\n", out_file_name, hr);
+  if (write_to_mem) {
+    // Output to a memory buffer. This is freed when 'stream' is released.
+    IFS(CreateStreamOnHGlobal(NULL, TRUE, stream));
+  } else {
+    IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, stream));
+  }
+  if (FAILED(hr)) {
+    fprintf(stderr, "Error opening output file %s (%08lx)\n",
+            out_file_name, hr);
+  }
  return hr;
 }

-static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
-                             unsigned char* rgb, int stride,
-                             uint32_t width, uint32_t height) {
+static HRESULT WriteUsingWIC(const char* out_file_name, int use_stdout,
+                             REFGUID container_guid,
+                             uint8_t* rgb, int stride,
+                             uint32_t width, uint32_t height, int has_alpha) {
  HRESULT hr = S_OK;
-  IWICImagingFactory* pFactory = NULL;
-  IWICBitmapFrameEncode* pFrame = NULL;
-  IWICBitmapEncoder* pEncoder = NULL;
-  IStream* pStream = NULL;
-  GUID pixel_format = GUID_WICPixelFormat24bppBGR;
+  IWICImagingFactory* factory = NULL;
+  IWICBitmapFrameEncode* frame = NULL;
+  IWICBitmapEncoder* encoder = NULL;
+  IStream* stream = NULL;
+  WICPixelFormatGUID pixel_format = has_alpha ? GUID_WICPixelFormat32bppBGRA
+                                              : GUID_WICPixelFormat24bppBGR;

  IFS(CoInitialize(NULL));
  IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
-          CLSCTX_INPROC_SERVER, MAKE_REFGUID(IID_IWICImagingFactory),
-          (LPVOID*)&pFactory));
+                       CLSCTX_INPROC_SERVER,
+                       MAKE_REFGUID(IID_IWICImagingFactory),
+                       (LPVOID*)&factory));
  if (hr == REGDB_E_CLASSNOTREG) {
-    printf("Couldn't access Windows Imaging Component (are you running \n");
-    printf("Windows XP SP3 or newer?). PNG support not available.\n");
-    printf("Use -ppm or -pgm for available PPM and PGM formats.\n");
+    fprintf(stderr,
+            "Couldn't access Windows Imaging Component (are you running "
+            "Windows XP SP3 or newer?). PNG support not available. "
+            "Use -ppm or -pgm for available PPM and PGM formats.\n");
  }
-  IFS(CreateOutputStream(out_file_name, &pStream));
-  IFS(IWICImagingFactory_CreateEncoder(pFactory, container_guid, NULL,
-          &pEncoder));
-  IFS(IWICBitmapEncoder_Initialize(pEncoder, pStream,
+  IFS(CreateOutputStream(out_file_name, use_stdout, &stream));
+  IFS(IWICImagingFactory_CreateEncoder(factory, container_guid, NULL,
+                                       &encoder));
+  IFS(IWICBitmapEncoder_Initialize(encoder, stream,
                                   WICBitmapEncoderNoCache));
-  IFS(IWICBitmapEncoder_CreateNewFrame(pEncoder, &pFrame, NULL));
-  IFS(IWICBitmapFrameEncode_Initialize(pFrame, NULL));
-  IFS(IWICBitmapFrameEncode_SetSize(pFrame, width, height));
-  IFS(IWICBitmapFrameEncode_SetPixelFormat(pFrame, &pixel_format));
-  IFS(IWICBitmapFrameEncode_WritePixels(pFrame, height, stride,
-          height * stride, rgb));
-  IFS(IWICBitmapFrameEncode_Commit(pFrame));
-  IFS(IWICBitmapEncoder_Commit(pEncoder));
+  IFS(IWICBitmapEncoder_CreateNewFrame(encoder, &frame, NULL));
+  IFS(IWICBitmapFrameEncode_Initialize(frame, NULL));
+  IFS(IWICBitmapFrameEncode_SetSize(frame, width, height));
+  IFS(IWICBitmapFrameEncode_SetPixelFormat(frame, &pixel_format));
+  IFS(IWICBitmapFrameEncode_WritePixels(frame, height, stride,
+                                        height * stride, rgb));
+  IFS(IWICBitmapFrameEncode_Commit(frame));
+  IFS(IWICBitmapEncoder_Commit(encoder));

-  if (pFrame != NULL) IUnknown_Release(pFrame);
-  if (pEncoder != NULL) IUnknown_Release(pEncoder);
-  if (pFactory != NULL) IUnknown_Release(pFactory);
-  if (pStream != NULL) IUnknown_Release(pStream);
+  if (SUCCEEDED(hr) && use_stdout) {
+    HGLOBAL image;
+    IFS(GetHGlobalFromStream(stream, &image));
+    if (SUCCEEDED(hr)) {
+      HANDLE std_output = GetStdHandle(STD_OUTPUT_HANDLE);
+      DWORD mode;
+      const BOOL update_mode = GetConsoleMode(std_output, &mode);
+      const void* const image_mem = GlobalLock(image);
+      DWORD bytes_written = 0;
+
+      // Clear output processing if necessary, then output the image.
+      if (update_mode) SetConsoleMode(std_output, 0);
+      if (!WriteFile(std_output, image_mem, (DWORD)GlobalSize(image),
+                     &bytes_written, NULL) ||
+          bytes_written != GlobalSize(image)) {
+        hr = E_FAIL;
+      }
+      if (update_mode) SetConsoleMode(std_output, mode);
+      GlobalUnlock(image);
+    }
+  }
+
+  if (frame != NULL) IUnknown_Release(frame);
+  if (encoder != NULL) IUnknown_Release(encoder);
+  if (factory != NULL) IUnknown_Release(factory);
+  if (stream != NULL) IUnknown_Release(stream);
  return hr;
 }

-static int WritePNG(const char* out_file_name, unsigned char* rgb, int stride,
-                    uint32_t width, uint32_t height) {
-  return SUCCEEDED(WriteUsingWIC(out_file_name,
-             MAKE_REFGUID(GUID_ContainerFormatPng), rgb, stride, width,
-             height));
+static int WritePNG(const char* out_file_name, int use_stdout,
+                    const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  uint8_t* const rgb = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const int has_alpha = (buffer->colorspace == MODE_BGRA);
+
+  return SUCCEEDED(WriteUsingWIC(out_file_name, use_stdout,
+                                 MAKE_REFGUID(GUID_ContainerFormatPng),
+                                 rgb, stride, width, height, has_alpha));
 }

-#elif defined(WEBP_HAVE_PNG)    // !WIN32
-static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
+#elif defined(WEBP_HAVE_PNG)    // !HAVE_WINCODEC_H
+static void PNGAPI PNGErrorFunction(png_structp png, png_const_charp dummy) {
  (void)dummy;  // remove variable-unused warning
  longjmp(png_jmpbuf(png), 1);
 }

-static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
-                    png_uint_32 width, png_uint_32 height) {
+static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  uint8_t* const rgb = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const int has_alpha = (buffer->colorspace == MODE_RGBA);
  png_structp png;
  png_infop info;
  png_uint_32 y;

  png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
-                                NULL, error_function, NULL);
+                                NULL, PNGErrorFunction, NULL);
  if (png == NULL) {
    return 0;
  }
@ -142,7 +211,8 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
    return 0;
  }
  png_init_io(png, out_file);
-  png_set_IHDR(png, info, width, height, 8, PNG_COLOR_TYPE_RGB,
+  png_set_IHDR(png, info, width, height, 8,
+               has_alpha ? PNG_COLOR_TYPE_RGBA : PNG_COLOR_TYPE_RGB,
               PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
               PNG_FILTER_TYPE_DEFAULT);
  png_write_info(png, info);
@ -154,218 +224,527 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
  png_destroy_write_struct(&png, &info);
  return 1;
 }
-#else    // !WIN32 && !WEBP_HAVE_PNG
-
-typedef uint32_t png_uint_32;
-
-static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
-                    png_uint_32 width, png_uint_32 height) {
-  printf("PNG support not compiled. Please install the libpng development "
-         "package before building.\n");
-  printf("You can run with -ppm flag to decode in PPM format.\n");
+#else    // !HAVE_WINCODEC_H && !WEBP_HAVE_PNG
+static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
+  (void)out_file;
+  (void)buffer;
+  fprintf(stderr, "PNG support not compiled. Please install the libpng "
+          "development package before building.\n");
+  fprintf(stderr, "You can run with -ppm flag to decode in PPM format.\n");
  return 0;
 }
 #endif

-static int WritePPM(FILE* fout, unsigned char* rgb,
-                    uint32_t width, uint32_t height) {
-  fprintf(fout, "P6\n%d %d\n255\n", width, height);
-  return (fwrite(rgb, width * height, 3, fout) == 3);
+static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  const uint8_t* const rgb = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const size_t bytes_per_px = alpha ? 4 : 3;
+  uint32_t y;
+
+  if (alpha) {
+    fprintf(fout, "P7\nWIDTH %d\nHEIGHT %d\nDEPTH 4\nMAXVAL 255\n"
+                  "TUPLTYPE RGB_ALPHA\nENDHDR\n", width, height);
+  } else {
+    fprintf(fout, "P6\n%d %d\n255\n", width, height);
+  }
+  for (y = 0; y < height; ++y) {
+    if (fwrite(rgb + y * stride, width, bytes_per_px, fout) != bytes_per_px) {
+      return 0;
+    }
+  }
+  return 1;
 }

-static int WritePGM(FILE* fout,
-                    unsigned char* y_plane, unsigned char *u, unsigned char* v,
-                    int y_stride, int uv_stride,
-                    uint32_t width, uint32_t height) {
-  // Save a grayscale PGM file using the IMC4 layout
-  // (http://www.fourcc.org/yuv.php#IMC4). This is a very
-  // convenient format for viewing the samples, esp. for
-  // odd dimensions.
-  int ok = 1;
-  unsigned int y;
-  const unsigned int uv_width = (width + 1) / 2;
-  const unsigned int uv_height = (height + 1) / 2;
-  const unsigned int out_stride = (width + 1) & ~1;
-  fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height);
-  for (y = 0; ok && y < height; ++y) {
-    ok &= (fwrite(y_plane + y * y_stride, width, 1, fout) == 1);
-    if (width & 1) fputc(0, fout);    // padding byte
+static void PutLE16(uint8_t* const dst, uint32_t value) {
+  dst[0] = (value >> 0) & 0xff;
+  dst[1] = (value >> 8) & 0xff;
+}
+
+static void PutLE32(uint8_t* const dst, uint32_t value) {
+  PutLE16(dst + 0, (value >>  0) & 0xffff);
+  PutLE16(dst + 2, (value >> 16) & 0xffff);
+}
+
+#define BMP_HEADER_SIZE 54
+static int WriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
+  const int has_alpha = (buffer->colorspace != MODE_BGR);
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  const uint8_t* const rgba = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const uint32_t bytes_per_px = has_alpha ? 4 : 3;
+  uint32_t y;
+  const uint32_t line_size = bytes_per_px * width;
+  const uint32_t bmp_stride = (line_size + 3) & ~3;   // pad to 4
+  const uint32_t total_size = bmp_stride * height + BMP_HEADER_SIZE;
+  uint8_t bmp_header[BMP_HEADER_SIZE] = { 0 };
+
+  // bitmap file header
+  PutLE16(bmp_header + 0, 0x4d42);                // signature 'BM'
+  PutLE32(bmp_header + 2, total_size);            // size including header
+  PutLE32(bmp_header + 6, 0);                     // reserved
+  PutLE32(bmp_header + 10, BMP_HEADER_SIZE);      // offset to pixel array
+  // bitmap info header
+  PutLE32(bmp_header + 14, 40);                   // DIB header size
+  PutLE32(bmp_header + 18, width);                // dimensions
+  PutLE32(bmp_header + 22, -(int)height);         // vertical flip!
+  PutLE16(bmp_header + 26, 1);                    // number of planes
+  PutLE16(bmp_header + 28, bytes_per_px * 8);     // bits per pixel
+  PutLE32(bmp_header + 30, 0);                    // no compression (BI_RGB)
+  PutLE32(bmp_header + 34, 0);                    // image size (dummy)
+  PutLE32(bmp_header + 38, 2400);                 // x pixels/meter
+  PutLE32(bmp_header + 42, 2400);                 // y pixels/meter
+  PutLE32(bmp_header + 46, 0);                    // number of palette colors
+  PutLE32(bmp_header + 50, 0);                    // important color count
+
+  // TODO(skal): color profile
+
+  // write header
+  if (fwrite(bmp_header, sizeof(bmp_header), 1, fout) != 1) {
+    return 0;
  }
-  for (y = 0; ok && y < uv_height; ++y) {
-    ok &= (fwrite(u + y * uv_stride, uv_width, 1, fout) == 1);
-    ok &= (fwrite(v + y * uv_stride, uv_width, 1, fout) == 1);
+
+  // write pixel array
+  for (y = 0; y < height; ++y) {
+    if (fwrite(rgba + y * stride, line_size, 1, fout) != 1) {
+      return 0;
+    }
+    // write padding zeroes
+    if (bmp_stride != line_size) {
+      const uint8_t zeroes[3] = { 0 };
+      if (fwrite(zeroes, bmp_stride - line_size, 1, fout) != 1) {
+        return 0;
+      }
+    }
+  }
+  return 1;
+}
+#undef BMP_HEADER_SIZE
+
+#define NUM_IFD_ENTRIES 15
+#define EXTRA_DATA_SIZE 16
+// 10b for signature/header + n * 12b entries + 4b for IFD terminator:
+#define EXTRA_DATA_OFFSET (10 + 12 * NUM_IFD_ENTRIES + 4)
+#define TIFF_HEADER_SIZE (EXTRA_DATA_OFFSET + EXTRA_DATA_SIZE)
+
+static int WriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
+  const int has_alpha = (buffer->colorspace != MODE_RGB);
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  const uint8_t* const rgba = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const uint8_t bytes_per_px = has_alpha ? 4 : 3;
+  // For non-alpha case, we omit tag 0x152 (ExtraSamples).
+  const uint8_t num_ifd_entries = has_alpha ? NUM_IFD_ENTRIES
+                                            : NUM_IFD_ENTRIES - 1;
+  uint8_t tiff_header[TIFF_HEADER_SIZE] = {
+    0x49, 0x49, 0x2a, 0x00,   // little endian signature
+    8, 0, 0, 0,               // offset to the unique IFD that follows
+    // IFD (offset = 8). Entries must be written in increasing tag order.
+    num_ifd_entries, 0,       // Number of entries in the IFD (12 bytes each).
+    0x00, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    //  10: Width  (TBD)
+    0x01, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    //  22: Height (TBD)
+    0x02, 0x01, 3, 0, bytes_per_px, 0, 0, 0,     //  34: BitsPerSample: 8888
+        EXTRA_DATA_OFFSET + 0, 0, 0, 0,
+    0x03, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    //  46: Compression: none
+    0x06, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    //  58: Photometric: RGB
+    0x11, 0x01, 4, 0, 1, 0, 0, 0,                //  70: Strips offset:
+        TIFF_HEADER_SIZE, 0, 0, 0,               //      data follows header
+    0x12, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    //  82: Orientation: topleft
+    0x15, 0x01, 3, 0, 1, 0, 0, 0,                //  94: SamplesPerPixels
+        bytes_per_px, 0, 0, 0,
+    0x16, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    // 106: Rows per strip (TBD)
+    0x17, 0x01, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0,    // 118: StripByteCount (TBD)
+    0x1a, 0x01, 5, 0, 1, 0, 0, 0,                // 130: X-resolution
+        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
+    0x1b, 0x01, 5, 0, 1, 0, 0, 0,                // 142: Y-resolution
+        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
+    0x1c, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 154: PlanarConfiguration
+    0x28, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    // 166: ResolutionUnit (inch)
+    0x52, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 178: ExtraSamples: rgbA
+    0, 0, 0, 0,                                  // 190: IFD terminator
+    // EXTRA_DATA_OFFSET:
+    8, 0, 8, 0, 8, 0, 8, 0,      // BitsPerSample
+    72, 0, 0, 0, 1, 0, 0, 0      // 72 pixels/inch, for X/Y-resolution
+  };
+  uint32_t y;
+
+  // Fill placeholders in IFD:
+  PutLE32(tiff_header + 10 + 8, width);
+  PutLE32(tiff_header + 22 + 8, height);
+  PutLE32(tiff_header + 106 + 8, height);
+  PutLE32(tiff_header + 118 + 8, width * bytes_per_px * height);
+  if (!has_alpha) PutLE32(tiff_header + 178, 0);  // IFD terminator
+
+  // write header
+  if (fwrite(tiff_header, sizeof(tiff_header), 1, fout) != 1) {
+    return 0;
+  }
+  // write pixel values
+  for (y = 0; y < height; ++y) {
+    if (fwrite(rgba + y * stride, bytes_per_px, width, fout) != width) {
+      return 0;
+    }
+  }
+
+  return 1;
+}
+
+#undef TIFF_HEADER_SIZE
+#undef EXTRA_DATA_OFFSET
+#undef EXTRA_DATA_SIZE
+#undef NUM_IFD_ENTRIES
+
+static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  const uint8_t* const a = buffer->u.YUVA.a;
+  const int a_stride = buffer->u.YUVA.a_stride;
+  uint32_t y;
+  assert(a != NULL);
+  fprintf(fout, "P5\n%d %d\n255\n", width, height);
+  for (y = 0; y < height; ++y) {
+    if (fwrite(a + y * a_stride, width, 1, fout) != 1) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+// format=PGM: save a grayscale PGM file using the IMC4 layout
+// (http://www.fourcc.org/yuv.php#IMC4). This is a very convenient format for
+// viewing the samples, esp. for odd dimensions.
+// format=YUV: just save the Y/U/V/A planes sequentially without header.
+static int WritePGMOrYUV(FILE* fout, const WebPDecBuffer* const buffer,
+                         OutputFileFormat format) {
+  const int width = buffer->width;
+  const int height = buffer->height;
+  const WebPYUVABuffer* const yuv = &buffer->u.YUVA;
+  int ok = 1;
+  int y;
+  const int pad = (format == YUV) ? 0 : 1;
+  const int uv_width = (width + 1) / 2;
+  const int uv_height = (height + 1) / 2;
+  const int out_stride = (width + pad) & ~pad;
+  const int a_height = yuv->a ? height : 0;
+  if (format == PGM) {
+    fprintf(fout, "P5\n%d %d\n255\n",
+            out_stride, height + uv_height + a_height);
+  }
+  for (y = 0; ok && y < height; ++y) {
+    ok &= (fwrite(yuv->y + y * yuv->y_stride, width, 1, fout) == 1);
+    if (format == PGM) {
+      if (width & 1) fputc(0, fout);    // padding byte
+    }
+  }
+  if (format == PGM) {   // IMC4 layout
+    for (y = 0; ok && y < uv_height; ++y) {
+      ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
+      ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
+    }
+  } else {
+    for (y = 0; ok && y < uv_height; ++y) {
+      ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
+    }
+    for (y = 0; ok && y < uv_height; ++y) {
+      ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
+    }
+  }
+  for (y = 0; ok && y < a_height; ++y) {
+    ok &= (fwrite(yuv->a + y * yuv->a_stride, width, 1, fout) == 1);
+    if (format == PGM) {
+      if (width & 1) fputc(0, fout);    // padding byte
+    }
  }
  return ok;
 }

-typedef enum {
-  PNG = 0,
-  PPM,
-  PGM,
-} OutputFileFormat;
+static int SaveOutput(const WebPDecBuffer* const buffer,
+                      OutputFileFormat format, const char* const out_file) {
+  FILE* fout = NULL;
+  int needs_open_file = 1;
+  const int use_stdout = !strcmp(out_file, "-");
+  int ok = 1;
+  Stopwatch stop_watch;
+
+  if (verbose) {
+    StopwatchReset(&stop_watch);
+  }
+
+#ifdef HAVE_WINCODEC_H
+  needs_open_file = (format != PNG);
+#endif
+
+  if (needs_open_file) {
+    fout = use_stdout ? ExUtilSetBinaryMode(stdout) : fopen(out_file, "wb");
+    if (fout == NULL) {
+      fprintf(stderr, "Error opening output file %s\n", out_file);
+      return 0;
+    }
+  }
+
+  if (format == PNG) {
+#ifdef HAVE_WINCODEC_H
+    ok &= WritePNG(out_file, use_stdout, buffer);
+#else
+    ok &= WritePNG(fout, buffer);
+#endif
+  } else if (format == PAM) {
+    ok &= WritePPM(fout, buffer, 1);
+  } else if (format == PPM) {
+    ok &= WritePPM(fout, buffer, 0);
+  } else if (format == BMP) {
+    ok &= WriteBMP(fout, buffer);
+  } else if (format == TIFF) {
+    ok &= WriteTIFF(fout, buffer);
+  } else if (format == PGM || format == YUV) {
+    ok &= WritePGMOrYUV(fout, buffer, format);
+  } else if (format == ALPHA_PLANE_ONLY) {
+    ok &= WriteAlphaPlane(fout, buffer);
+  }
+  if (fout != NULL && fout != stdout) {
+    fclose(fout);
+  }
+  if (ok) {
+    if (use_stdout) {
+      fprintf(stderr, "Saved to stdout\n");
+    } else {
+      fprintf(stderr, "Saved file %s\n", out_file);
+    }
+    if (verbose) {
+      const double write_time = StopwatchReadAndReset(&stop_watch);
+      fprintf(stderr, "Time to write output: %.3fs\n", write_time);
+    }
+  } else {
+    if (use_stdout) {
+      fprintf(stderr, "Error writing to stdout !!\n");
+    } else {
+      fprintf(stderr, "Error writing file %s !!\n", out_file);
+    }
+  }
+  return ok;
+}

 static void Help(void) {
-  printf("Usage: dwebp "
-         "[in_file] [-h] [-v] [-ppm] [-pgm] [-version] [-o out_file]\n\n"
+  printf("Usage: dwebp in_file [options] [-o out_file]\n\n"
         "Decodes the WebP image file to PNG format [Default]\n"
         "Use following options to convert into alternate image formats:\n"
-         " -ppm:  save the raw RGB samples as color PPM\n"
-         " -pgm:  save the raw YUV samples as a grayscale PGM\n"
-         "        file with IMC4 layout.\n"
-         " -version: print version number and exit.\n"
-         "Use -v for verbose (e.g. print encoding/decoding times)\n"
+         "  -pam ......... save the raw RGBA samples as a color PAM\n"
+         "  -ppm ......... save the raw RGB samples as a color PPM\n"
+         "  -bmp ......... save as uncompressed BMP format\n"
+         "  -tiff ........ save as uncompressed TIFF format\n"
+         "  -pgm ......... save the raw YUV samples as a grayscale PGM\n"
+         "                 file with IMC4 layout\n"
+         "  -yuv ......... save the raw YUV samples in flat layout\n"
+         "\n"
+         " Other options are:\n"
+         "  -version  .... print version number and exit\n"
+         "  -nofancy ..... don't use the fancy YUV420 upscaler\n"
+         "  -nofilter .... disable in-loop filtering\n"
+         "  -nodither .... disable dithering\n"
+         "  -dither <d> .. dithering strength (in 0..100)\n"
+#if WEBP_DECODER_ABI_VERSION > 0x0204
+         "  -alpha_dither  use alpha-plane dithering if needed\n"
+#endif
+         "  -mt .......... use multi-threading\n"
+         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
+         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
+#if WEBP_DECODER_ABI_VERSION > 0x0203
+         "  -flip ........ flip the output vertically\n"
+#endif
+         "  -alpha ....... only save the alpha plane\n"
+         "  -incremental . use incremental decoding (useful for tests)\n"
+         "  -h     ....... this help message\n"
+         "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
+#ifndef WEBP_DLL
+         "  -noasm ....... disable all assembly optimizations\n"
+#endif
        );
 }

+static const char* const kFormatType[] = {
+  "unspecified", "lossy", "lossless"
+};
+
 int main(int argc, const char *argv[]) {
+  int ok = 0;
  const char *in_file = NULL;
  const char *out_file = NULL;

-  int width, height, stride, uv_stride;
-  uint8_t* out = NULL, *u = NULL, *v = NULL;
+  WebPDecoderConfig config;
+  WebPDecBuffer* const output_buffer = &config.output;
+  WebPBitstreamFeatures* const bitstream = &config.input;
  OutputFileFormat format = PNG;
-  Stopwatch stop_watch;
+  int incremental = 0;
  int c;
+
+  if (!WebPInitDecoderConfig(&config)) {
+    fprintf(stderr, "Library version mismatch!\n");
+    return -1;
+  }
+
  for (c = 1; c < argc; ++c) {
+    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
      return 0;
    } else if (!strcmp(argv[c], "-o") && c < argc - 1) {
      out_file = argv[++c];
+    } else if (!strcmp(argv[c], "-alpha")) {
+      format = ALPHA_PLANE_ONLY;
+    } else if (!strcmp(argv[c], "-nofancy")) {
+      config.options.no_fancy_upsampling = 1;
+    } else if (!strcmp(argv[c], "-nofilter")) {
+      config.options.bypass_filtering = 1;
+    } else if (!strcmp(argv[c], "-pam")) {
+      format = PAM;
    } else if (!strcmp(argv[c], "-ppm")) {
      format = PPM;
+    } else if (!strcmp(argv[c], "-bmp")) {
+      format = BMP;
+    } else if (!strcmp(argv[c], "-tiff")) {
+      format = TIFF;
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetDecoderVersion();
      printf("%d.%d.%d\n",
-        (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
+             (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
      return 0;
    } else if (!strcmp(argv[c], "-pgm")) {
      format = PGM;
+    } else if (!strcmp(argv[c], "-yuv")) {
+      format = YUV;
+    } else if (!strcmp(argv[c], "-mt")) {
+      config.options.use_threads = 1;
+#if WEBP_DECODER_ABI_VERSION > 0x0204
+    } else if (!strcmp(argv[c], "-alpha_dither")) {
+      config.options.alpha_dithering_strength = 100;
+#endif
+    } else if (!strcmp(argv[c], "-nodither")) {
+      config.options.dithering_strength = 0;
+    } else if (!strcmp(argv[c], "-dither") && c < argc - 1) {
+      config.options.dithering_strength =
+          ExUtilGetInt(argv[++c], 0, &parse_error);
+    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
+      config.options.use_cropping = 1;
+      config.options.crop_left   = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.options.crop_top    = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.options.crop_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.options.crop_height = ExUtilGetInt(argv[++c], 0, &parse_error);
+    } else if (!strcmp(argv[c], "-scale") && c < argc - 2) {
+      config.options.use_scaling = 1;
+      config.options.scaled_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.options.scaled_height = ExUtilGetInt(argv[++c], 0, &parse_error);
+#if WEBP_DECODER_ABI_VERSION > 0x0203
+    } else if (!strcmp(argv[c], "-flip")) {
+      config.options.flip = 1;
+#endif
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
+#ifndef WEBP_DLL
+    } else if (!strcmp(argv[c], "-noasm")) {
+      VP8GetCPUInfo = NULL;
+#endif
+    } else if (!strcmp(argv[c], "-incremental")) {
+      incremental = 1;
+    } else if (!strcmp(argv[c], "--")) {
+      if (c < argc - 1) in_file = argv[++c];
+      break;
    } else if (argv[c][0] == '-') {
-      printf("Unknown option '%s'\n", argv[c]);
+      fprintf(stderr, "Unknown option '%s'\n", argv[c]);
      Help();
      return -1;
    } else {
      in_file = argv[c];
    }
+
+    if (parse_error) {
+      Help();
+      return -1;
+    }
  }

  if (in_file == NULL) {
-    printf("missing input file!!\n");
+    fprintf(stderr, "missing input file!!\n");
    Help();
    return -1;
  }

  {
-    uint32_t data_size = 0;
-    void* data = NULL;
-    int ok;
-    FILE* const in = fopen(in_file, "rb");
-    if (!in) {
-      fprintf(stderr, "cannot open input file '%s'\n", in_file);
-      return 1;
-    }
-    fseek(in, 0, SEEK_END);
-    data_size = ftell(in);
-    fseek(in, 0, SEEK_SET);
-    data = malloc(data_size);
-    ok = (fread(data, data_size, 1, in) == 1);
-    fclose(in);
-    if (!ok) {
-      free(data);
+    VP8StatusCode status = VP8_STATUS_OK;
+    size_t data_size = 0;
+    const uint8_t* data = NULL;
+    if (!ExUtilLoadWebP(in_file, &data, &data_size, bitstream)) {
      return -1;
    }

-    if (verbose)
-      StopwatchReadAndReset(&stop_watch);
    switch (format) {
      case PNG:
-#ifdef _WIN32
-        out = WebPDecodeBGR((const uint8_t*)data, data_size, &width, &height);
+#ifdef HAVE_WINCODEC_H
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_BGRA : MODE_BGR;
 #else
-        out = WebPDecodeRGB((const uint8_t*)data, data_size, &width, &height);
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB;
 #endif
        break;
+      case PAM:
+        output_buffer->colorspace = MODE_RGBA;
+        break;
      case PPM:
-        out = WebPDecodeRGB((const uint8_t*)data, data_size, &width, &height);
+        output_buffer->colorspace = MODE_RGB;  // drops alpha for PPM
+        break;
+      case BMP:
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_BGRA : MODE_BGR;
+        break;
+      case TIFF:    // note: force pre-multiplied alpha
+        output_buffer->colorspace =
+            bitstream->has_alpha ? MODE_rgbA : MODE_RGB;
        break;
      case PGM:
-        out = WebPDecodeYUV((const uint8_t*)data, data_size, &width, &height,
-                            &u, &v, &stride, &uv_stride);
+      case YUV:
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_YUVA : MODE_YUV;
+        break;
+      case ALPHA_PLANE_ONLY:
+        output_buffer->colorspace = MODE_YUVA;
        break;
      default:
-        free(data);
+        free((void*)data);
        return -1;
    }

-    if (verbose) {
-      const double time = StopwatchReadAndReset(&stop_watch);
-      printf("Time to decode picture: %.3fs\n", time);
-    }
-
-    free(data);
-  }
-
-  if (!out) {
-    fprintf(stderr, "Decoding of %s failed.\n", in_file);
-    return -1;
-  }
-
-  if (out_file) {
-    FILE* fout = NULL;
-    int needs_open_file = 0;
-
-    printf("Decoded %s. Dimensions: %d x %d. Now saving...\n", in_file, width, height);
-    StopwatchReadAndReset(&stop_watch);
-#ifdef _WIN32
-    if (format != PNG) {
-      needs_open_file = 1;
-    }
-#else
-    needs_open_file = 1;
-#endif
-    if (needs_open_file) fout = fopen(out_file, "wb");
-    if (!needs_open_file || fout) {
-      int ok = 1;
-      if (format == PNG) {
-#ifdef _WIN32
-        ok &= WritePNG(out_file, out, 3 * width, width, height);
-#else
-        ok &= WritePNG(fout, out, 3 * width, width, height);
-#endif
-      } else if (format == PPM) {
-        ok &= WritePPM(fout, out, width, height);
-      } else if (format == PGM) {
-        ok &= WritePGM(fout, out, u, v, stride, uv_stride, width, height);
-      }
-      if (fout)
-        fclose(fout);
-      if (ok) {
-        printf("Saved file %s\n", out_file);
-        if (verbose) {
-          const double time = StopwatchReadAndReset(&stop_watch);
-          printf("Time to write output: %.3fs\n", time);
-        }
-      } else {
-        fprintf(stderr, "Error writing file %s !!\n", out_file);
-      }
+    if (incremental) {
+      status = ExUtilDecodeWebPIncremental(data, data_size, verbose, &config);
    } else {
-      fprintf(stderr, "Error opening output file %s\n", out_file);
+      status = ExUtilDecodeWebP(data, data_size, verbose, &config);
    }
-  } else {
-    printf("File %s can be decoded (dimensions: %d x %d).\n",
-           in_file, width, height);
-    printf("Nothing written; use -o flag to save the result as e.g. PNG.\n");
-  }
-  free(out);

-  return 0;
+    free((void*)data);
+    ok = (status == VP8_STATUS_OK);
+    if (!ok) {
+      ExUtilPrintWebPError(in_file, status);
+      goto Exit;
+    }
+  }
+
+  if (out_file != NULL) {
+    fprintf(stderr, "Decoded %s. Dimensions: %d x %d %s. Format: %s. "
+                    "Now saving...\n",
+            in_file, output_buffer->width, output_buffer->height,
+            bitstream->has_alpha ? " (with alpha)" : "",
+            kFormatType[bitstream->format]);
+    ok = SaveOutput(output_buffer, format, out_file);
+  } else {
+    fprintf(stderr, "File %s can be decoded "
+                    "(dimensions: %d x %d %s. Format: %s).\n",
+            in_file, output_buffer->width, output_buffer->height,
+            bitstream->has_alpha ? " (with alpha)" : "",
+            kFormatType[bitstream->format]);
+    fprintf(stderr, "Nothing written; "
+                    "use -o flag to save the result as e.g. PNG.\n");
+  }
+ Exit:
+  WebPFreeDecBuffer(output_buffer);
+  return ok ? 0 : -1;
 }

-//-----------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+//------------------------------------------------------------------------------
--- a/examples/example_util.c
+++ b/examples/example_util.c
@ -0,0 +1,258 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Utility functions used by the example programs.
+//
+
+#include "./example_util.h"
+
+#if defined(_WIN32)
+#include <fcntl.h>   // for _O_BINARY
+#include <io.h>      // for _setmode()
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/decode.h"
+#include "./stopwatch.h"
+
+//------------------------------------------------------------------------------
+// String parsing
+
+uint32_t ExUtilGetUInt(const char* const v, int base, int* const error) {
+  char* end = NULL;
+  const uint32_t n = (v != NULL) ? (uint32_t)strtoul(v, &end, base) : 0u;
+  if (end == v && error != NULL && !*error) {
+    *error = 1;
+    fprintf(stderr, "Error! '%s' is not an integer.\n",
+            (v != NULL) ? v : "(null)");
+  }
+  return n;
+}
+
+int ExUtilGetInt(const char* const v, int base, int* const error) {
+  return (int)ExUtilGetUInt(v, base, error);
+}
+
+float ExUtilGetFloat(const char* const v, int* const error) {
+  char* end = NULL;
+  const float f = (v != NULL) ? (float)strtod(v, &end) : 0.f;
+  if (end == v && error != NULL && !*error) {
+    *error = 1;
+    fprintf(stderr, "Error! '%s' is not a floating point number.\n",
+            (v != NULL) ? v : "(null)");
+  }
+  return f;
+}
+
+// -----------------------------------------------------------------------------
+// File I/O
+
+FILE* ExUtilSetBinaryMode(FILE* file) {
+#if defined(_WIN32)
+  if (_setmode(_fileno(file), _O_BINARY) == -1) {
+    fprintf(stderr, "Failed to reopen file in O_BINARY mode.\n");
+    return NULL;
+  }
+#endif
+  return file;
+}
+
+int ExUtilReadFromStdin(const uint8_t** data, size_t* data_size) {
+  static const size_t kBlockSize = 16384;  // default initial size
+  size_t max_size = 0;
+  size_t size = 0;
+  uint8_t* input = NULL;
+
+  if (data == NULL || data_size == NULL) return 0;
+  *data = NULL;
+  *data_size = 0;
+
+  if (!ExUtilSetBinaryMode(stdin)) return 0;
+
+  while (!feof(stdin)) {
+    // We double the buffer size each time and read as much as possible.
+    const size_t extra_size = (max_size == 0) ? kBlockSize : max_size;
+    void* const new_data = realloc(input, max_size + extra_size);
+    if (new_data == NULL) goto Error;
+    input = (uint8_t*)new_data;
+    max_size += extra_size;
+    size += fread(input + size, 1, extra_size, stdin);
+    if (size < max_size) break;
+  }
+  if (ferror(stdin)) goto Error;
+  *data = input;
+  *data_size = size;
+  return 1;
+
+ Error:
+  free(input);
+  fprintf(stderr, "Could not read from stdin\n");
+  return 0;
+}
+
+int ExUtilReadFile(const char* const file_name,
+                   const uint8_t** data, size_t* data_size) {
+  int ok;
+  void* file_data;
+  size_t file_size;
+  FILE* in;
+  const int from_stdin = (file_name == NULL) || !strcmp(file_name, "-");
+
+  if (from_stdin) return ExUtilReadFromStdin(data, data_size);
+
+  if (data == NULL || data_size == NULL) return 0;
+  *data = NULL;
+  *data_size = 0;
+
+  in = fopen(file_name, "rb");
+  if (in == NULL) {
+    fprintf(stderr, "cannot open input file '%s'\n", file_name);
+    return 0;
+  }
+  fseek(in, 0, SEEK_END);
+  file_size = ftell(in);
+  fseek(in, 0, SEEK_SET);
+  file_data = malloc(file_size);
+  if (file_data == NULL) return 0;
+  ok = (fread(file_data, file_size, 1, in) == 1);
+  fclose(in);
+
+  if (!ok) {
+    fprintf(stderr, "Could not read %d bytes of data from file %s\n",
+            (int)file_size, file_name);
+    free(file_data);
+    return 0;
+  }
+  *data = (uint8_t*)file_data;
+  *data_size = file_size;
+  return 1;
+}
+
+int ExUtilWriteFile(const char* const file_name,
+                    const uint8_t* data, size_t data_size) {
+  int ok;
+  FILE* out;
+  const int to_stdout = (file_name == NULL) || !strcmp(file_name, "-");
+
+  if (data == NULL) {
+    return 0;
+  }
+  out = to_stdout ? stdout : fopen(file_name, "wb");
+  if (out == NULL) {
+    fprintf(stderr, "Error! Cannot open output file '%s'\n", file_name);
+    return 0;
+  }
+  ok = (fwrite(data, data_size, 1, out) == 1);
+  if (out != stdout) fclose(out);
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+// WebP decoding
+
+static const char* const kStatusMessages[VP8_STATUS_NOT_ENOUGH_DATA + 1] = {
+  "OK", "OUT_OF_MEMORY", "INVALID_PARAM", "BITSTREAM_ERROR",
+  "UNSUPPORTED_FEATURE", "SUSPENDED", "USER_ABORT", "NOT_ENOUGH_DATA"
+};
+
+static void PrintAnimationWarning(const WebPDecoderConfig* const config) {
+  if (config->input.has_animation) {
+    fprintf(stderr,
+            "Error! Decoding of an animated WebP file is not supported.\n"
+            "       Use webpmux to extract the individual frames or\n"
+            "       vwebp to view this image.\n");
+  }
+}
+
+void ExUtilPrintWebPError(const char* const in_file, int status) {
+  fprintf(stderr, "Decoding of %s failed.\n", in_file);
+  fprintf(stderr, "Status: %d", status);
+  if (status >= VP8_STATUS_OK && status <= VP8_STATUS_NOT_ENOUGH_DATA) {
+    fprintf(stderr, "(%s)", kStatusMessages[status]);
+  }
+  fprintf(stderr, "\n");
+}
+
+int ExUtilLoadWebP(const char* const in_file,
+                   const uint8_t** data, size_t* data_size,
+                   WebPBitstreamFeatures* bitstream) {
+  VP8StatusCode status;
+  WebPBitstreamFeatures local_features;
+  if (!ExUtilReadFile(in_file, data, data_size)) return 0;
+
+  if (bitstream == NULL) {
+    bitstream = &local_features;
+  }
+
+  status = WebPGetFeatures(*data, *data_size, bitstream);
+  if (status != VP8_STATUS_OK) {
+    free((void*)*data);
+    *data = NULL;
+    *data_size = 0;
+    ExUtilPrintWebPError(in_file, status);
+    return 0;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+VP8StatusCode ExUtilDecodeWebP(const uint8_t* const data, size_t data_size,
+                               int verbose, WebPDecoderConfig* const config) {
+  Stopwatch stop_watch;
+  VP8StatusCode status = VP8_STATUS_OK;
+  if (config == NULL) return VP8_STATUS_INVALID_PARAM;
+
+  PrintAnimationWarning(config);
+
+  StopwatchReset(&stop_watch);
+
+  // Decoding call.
+  status = WebPDecode(data, data_size, config);
+
+  if (verbose) {
+    const double decode_time = StopwatchReadAndReset(&stop_watch);
+    fprintf(stderr, "Time to decode picture: %.3fs\n", decode_time);
+  }
+  return status;
+}
+
+VP8StatusCode ExUtilDecodeWebPIncremental(
+    const uint8_t* const data, size_t data_size,
+    int verbose, WebPDecoderConfig* const config) {
+  Stopwatch stop_watch;
+  VP8StatusCode status = VP8_STATUS_OK;
+  if (config == NULL) return VP8_STATUS_INVALID_PARAM;
+
+  PrintAnimationWarning(config);
+
+  StopwatchReset(&stop_watch);
+
+  // Decoding call.
+  {
+    WebPIDecoder* const idec = WebPIDecode(data, data_size, config);
+    if (idec == NULL) {
+      fprintf(stderr, "Failed during WebPINewDecoder().\n");
+      return VP8_STATUS_OUT_OF_MEMORY;
+    } else {
+      status = WebPIUpdate(idec, data, data_size);
+      WebPIDelete(idec);
+    }
+  }
+
+  if (verbose) {
+    const double decode_time = StopwatchReadAndReset(&stop_watch);
+    fprintf(stderr, "Time to decode picture: %.3fs\n", decode_time);
+  }
+  return status;
+}
+
+// -----------------------------------------------------------------------------
--- a/examples/example_util.h
+++ b/examples/example_util.h
@ -0,0 +1,89 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Utility functions used by the example programs.
+//
+
+#ifndef WEBP_EXAMPLES_EXAMPLE_UTIL_H_
+#define WEBP_EXAMPLES_EXAMPLE_UTIL_H_
+
+#include <stdio.h>
+#include "webp/decode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// String parsing
+
+// Parses 'v' using strto(ul|l|d)(). If error is non-NULL, '*error' is set to
+// true on failure while on success it is left unmodified to allow chaining of
+// calls. An error is only printed on the first occurrence.
+uint32_t ExUtilGetUInt(const char* const v, int base, int* const error);
+int ExUtilGetInt(const char* const v, int base, int* const error);
+float ExUtilGetFloat(const char* const v, int* const error);
+
+//------------------------------------------------------------------------------
+// File I/O
+
+// Reopen file in binary (O_BINARY) mode.
+// Returns 'file' on success, NULL otherwise.
+FILE* ExUtilSetBinaryMode(FILE* file);
+
+// Allocates storage for entire file 'file_name' and returns contents and size
+// in 'data' and 'data_size'. Returns 1 on success, 0 otherwise. '*data' should
+// be deleted using free().
+// If 'file_name' is NULL or equal to "-", input is read from stdin by calling
+// the function ExUtilReadFromStdin().
+int ExUtilReadFile(const char* const file_name,
+                   const uint8_t** data, size_t* data_size);
+
+// Same as ExUtilReadFile(), but reads until EOF from stdin instead.
+int ExUtilReadFromStdin(const uint8_t** data, size_t* data_size);
+
+// Write a data segment into a file named 'file_name'. Returns true if ok.
+// If 'file_name' is NULL or equal to "-", output is written to stdout.
+int ExUtilWriteFile(const char* const file_name,
+                    const uint8_t* data, size_t data_size);
+
+//------------------------------------------------------------------------------
+// WebP decoding
+
+// Prints an informative error message regarding decode failure of 'in_file'.
+// 'status' is treated as a VP8StatusCode and if valid will be printed as a
+// text string.
+void ExUtilPrintWebPError(const char* const in_file, int status);
+
+// Reads a WebP from 'in_file', returning the contents and size in 'data' and
+// 'data_size'. If not NULL, 'bitstream' is populated using WebPGetFeatures().
+// Returns true on success.
+int ExUtilLoadWebP(const char* const in_file,
+                   const uint8_t** data, size_t* data_size,
+                   WebPBitstreamFeatures* bitstream);
+
+// Decodes the WebP contained in 'data'.
+// 'config' is a structure previously initialized by WebPInitDecoderConfig().
+// 'config->output' should have the desired colorspace selected. 'verbose' will
+// cause decode timing to be reported.
+// Returns the decoder status. On success 'config->output' will contain the
+// decoded picture.
+VP8StatusCode ExUtilDecodeWebP(const uint8_t* const data, size_t data_size,
+                               int verbose, WebPDecoderConfig* const config);
+
+// Same as ExUtilDecodeWebP(), but using the incremental decoder.
+VP8StatusCode ExUtilDecodeWebPIncremental(
+    const uint8_t* const data, size_t data_size,
+    int verbose, WebPDecoderConfig* const config);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_EXAMPLE_UTIL_H_
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -0,0 +1,732 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  simple tool to convert animated GIFs to WebP
+//
+// Authors: Skal (pascal.massimino@gmail.com)
+//          Urvang (urvang@google.com)
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#ifdef WEBP_HAVE_GIF
+
+#include <gif_lib.h>
+#include "webp/encode.h"
+#include "webp/mux.h"
+#include "./example_util.h"
+#include "./gif2webp_util.h"
+
+// GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
+#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR)
+# define LOCAL_GIF_VERSION ((GIFLIB_MAJOR << 8) | GIFLIB_MINOR)
+# define LOCAL_GIF_PREREQ(maj, min) \
+    (LOCAL_GIF_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GIF_VERSION 0
+# define LOCAL_GIF_PREREQ(maj, min) 0
+#endif
+
+#define GIF_TRANSPARENT_MASK 0x01
+#define GIF_DISPOSE_MASK     0x07
+#define GIF_DISPOSE_SHIFT    2
+#define WHITE_COLOR          0xffffffff
+#define MAX_CACHE_SIZE       30
+
+//------------------------------------------------------------------------------
+
+static int transparent_index;  // Index of transparent color in the map.
+
+static void ResetFrameInfo(WebPMuxFrameInfo* const info) {
+  WebPDataInit(&info->bitstream);
+  info->x_offset = 0;
+  info->y_offset = 0;
+  info->duration = 0;
+  info->id = WEBP_CHUNK_ANMF;
+  info->dispose_method = WEBP_MUX_DISPOSE_NONE;
+  info->blend_method = WEBP_MUX_BLEND;
+  transparent_index = -1;  // Opaque frame by default.
+}
+
+static void SanitizeKeyFrameIntervals(size_t* const kmin_ptr,
+                                      size_t* const kmax_ptr) {
+  size_t kmin = *kmin_ptr;
+  size_t kmax = *kmax_ptr;
+  int print_warning = 1;
+
+  if (kmin == 0) {  // Disable keyframe insertion.
+    kmax = ~0;
+    kmin = kmax - 1;
+    print_warning = 0;
+  }
+  if (kmax == 0) {
+    kmax = ~0;
+    print_warning = 0;
+  }
+
+  if (kmin >= kmax) {
+    kmin = kmax - 1;
+    if (print_warning) {
+      fprintf(stderr,
+              "WARNING: Setting kmin = %d, so that kmin < kmax.\n", (int)kmin);
+    }
+  } else if (kmin < (kmax / 2 + 1)) {
+    // This ensures that cache.keyframe + kmin >= kmax is always true. So, we
+    // can flush all the frames in the ‘count_since_key_frame == kmax’ case.
+    kmin = (kmax / 2 + 1);
+    if (print_warning) {
+      fprintf(stderr,
+              "WARNING: Setting kmin = %d, so that kmin >= kmax / 2 + 1.\n",
+              (int)kmin);
+    }
+  }
+  // Limit the max number of frames that are allocated.
+  if (kmax - kmin > MAX_CACHE_SIZE) {
+    kmin = kmax - MAX_CACHE_SIZE;
+    if (print_warning) {
+      fprintf(stderr,
+              "WARNING: Setting kmin = %d, so that kmax - kmin <= 30.\n",
+              (int)kmin);
+    }
+  }
+  *kmin_ptr = kmin;
+  *kmax_ptr = kmax;
+}
+
+static void Remap(const uint8_t* const src, const GifFileType* const gif,
+                  uint32_t* dst, int len) {
+  int i;
+  const GifColorType* colors;
+  const ColorMapObject* const cmap =
+      gif->Image.ColorMap ? gif->Image.ColorMap : gif->SColorMap;
+  if (cmap == NULL) return;
+  colors = cmap->Colors;
+
+  for (i = 0; i < len; ++i) {
+    const GifColorType c = colors[src[i]];
+    dst[i] = (src[i] == transparent_index) ? WEBP_UTIL_TRANSPARENT_COLOR
+           : c.Blue | (c.Green << 8) | (c.Red << 16) | (0xff << 24);
+  }
+}
+
+// Read the GIF image frame.
+static int ReadFrame(GifFileType* const gif, WebPFrameRect* const gif_rect,
+                     WebPPicture* const webp_frame) {
+  WebPPicture sub_image;
+  const GifImageDesc* const image_desc = &gif->Image;
+  uint32_t* dst = NULL;
+  uint8_t* tmp = NULL;
+  int ok = 0;
+  WebPFrameRect rect = {
+      image_desc->Left, image_desc->Top, image_desc->Width, image_desc->Height
+  };
+  *gif_rect = rect;
+
+  // Use a view for the sub-picture:
+  if (!WebPPictureView(webp_frame, rect.x_offset, rect.y_offset,
+                       rect.width, rect.height, &sub_image)) {
+    fprintf(stderr, "Sub-image %dx%d at position %d,%d is invalid!\n",
+            rect.width, rect.height, rect.x_offset, rect.y_offset);
+    return 0;
+  }
+  dst = sub_image.argb;
+
+  tmp = (uint8_t*)malloc(rect.width * sizeof(*tmp));
+  if (tmp == NULL) goto End;
+
+  if (image_desc->Interlace) {  // Interlaced image.
+    // We need 4 passes, with the following offsets and jumps.
+    const int interlace_offsets[] = { 0, 4, 2, 1 };
+    const int interlace_jumps[]   = { 8, 8, 4, 2 };
+    int pass;
+    for (pass = 0; pass < 4; ++pass) {
+      int y;
+      for (y = interlace_offsets[pass]; y < rect.height;
+           y += interlace_jumps[pass]) {
+        if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
+        Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
+      }
+    }
+  } else {  // Non-interlaced image.
+    int y;
+    for (y = 0; y < rect.height; ++y) {
+      if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
+      Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
+    }
+  }
+  ok = 1;
+
+ End:
+  if (!ok) webp_frame->error_code = sub_image.error_code;
+  WebPPictureFree(&sub_image);
+  free(tmp);
+  return ok;
+}
+
+static void GetBackgroundColor(const ColorMapObject* const color_map,
+                               int bgcolor_idx, uint32_t* const bgcolor) {
+  if (transparent_index != -1 && bgcolor_idx == transparent_index) {
+    *bgcolor = WEBP_UTIL_TRANSPARENT_COLOR;  // Special case.
+  } else if (color_map == NULL || color_map->Colors == NULL
+             || bgcolor_idx >= color_map->ColorCount) {
+    *bgcolor = WHITE_COLOR;
+    fprintf(stderr,
+            "GIF decode warning: invalid background color index. Assuming "
+            "white background.\n");
+  } else {
+    const GifColorType color = color_map->Colors[bgcolor_idx];
+    *bgcolor = (0xff        << 24)
+             | (color.Red   << 16)
+             | (color.Green <<  8)
+             | (color.Blue  <<  0);
+  }
+}
+
+static void DisplayGifError(const GifFileType* const gif, int gif_error) {
+  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
+#if LOCAL_GIF_PREREQ(4,2)
+#if LOCAL_GIF_PREREQ(5,0)
+  // Static string actually, hence the const char* cast.
+  const char* error_str = (const char*)GifErrorString(
+      (gif == NULL) ? gif_error : gif->Error);
+#else
+  const char* error_str = (const char*)GifErrorString();
+  (void)gif;
+#endif
+  if (error_str == NULL) error_str = "Unknown error";
+  fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
+#else
+  (void)gif;
+  fprintf(stderr, "GIFLib Error %d: ", gif_error);
+  PrintGifError();
+  fprintf(stderr, "\n");
+#endif
+}
+
+static const char* const kErrorMessages[-WEBP_MUX_NOT_ENOUGH_DATA + 1] = {
+  "WEBP_MUX_NOT_FOUND", "WEBP_MUX_INVALID_ARGUMENT", "WEBP_MUX_BAD_DATA",
+  "WEBP_MUX_MEMORY_ERROR", "WEBP_MUX_NOT_ENOUGH_DATA"
+};
+
+static const char* ErrorString(WebPMuxError err) {
+  assert(err <= WEBP_MUX_NOT_FOUND && err >= WEBP_MUX_NOT_ENOUGH_DATA);
+  return kErrorMessages[-err];
+}
+
+enum {
+  METADATA_ICC  = (1 << 0),
+  METADATA_XMP  = (1 << 1),
+  METADATA_ALL  = METADATA_ICC | METADATA_XMP
+};
+
+//------------------------------------------------------------------------------
+
+static void Help(void) {
+  printf("Usage:\n");
+  printf(" gif2webp [options] gif_file -o webp_file\n");
+  printf("Options:\n");
+  printf("  -h / -help  ............ this help\n");
+  printf("  -lossy ................. encode image using lossy compression\n");
+  printf("  -mixed ................. for each frame in the image, pick lossy\n"
+         "                           or lossless compression heuristically\n");
+  printf("  -q <float> ............. quality factor (0:small..100:big)\n");
+  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
+  printf("  -kmin <int> ............ min distance between key frames\n");
+  printf("  -kmax <int> ............ max distance between key frames\n");
+  printf("  -f <int> ............... filter strength (0=off..100)\n");
+  printf("  -metadata <string> ..... comma separated list of metadata to\n");
+  printf("                           ");
+  printf("copy from the input to the output if present\n");
+  printf("                           "
+         "Valid values: all, none, icc, xmp (default)\n");
+  printf("  -mt .................... use multi-threading if available\n");
+  printf("\n");
+  printf("  -version ............... print version number and exit\n");
+  printf("  -v ..................... verbose\n");
+  printf("  -quiet ................. don't print anything\n");
+  printf("\n");
+}
+
+//------------------------------------------------------------------------------
+
+int main(int argc, const char *argv[]) {
+  int verbose = 0;
+  int gif_error = GIF_ERROR;
+  WebPMuxError err = WEBP_MUX_OK;
+  int ok = 0;
+  const char *in_file = NULL, *out_file = NULL;
+  FILE* out = NULL;
+  GifFileType* gif = NULL;
+  WebPConfig config;
+  WebPPicture frame;
+  WebPMuxFrameInfo info;
+  WebPMuxAnimParams anim = { WHITE_COLOR, 0 };
+  WebPFrameCache* cache = NULL;
+
+  int is_first_frame = 1;     // Whether we are processing the first frame.
+  int done;
+  int c;
+  int quiet = 0;
+  WebPMux* mux = NULL;
+  WebPData webp_data = { NULL, 0 };
+  int keep_metadata = METADATA_XMP;  // ICC not output by default.
+  int stored_icc = 0;  // Whether we have already stored an ICC profile.
+  int stored_xmp = 0;
+
+  int default_kmin = 1;  // Whether to use default kmin value.
+  int default_kmax = 1;
+  size_t kmin = 0;
+  size_t kmax = 0;
+  int allow_mixed = 0;   // If true, each frame can be lossy or lossless.
+
+  ResetFrameInfo(&info);
+
+  if (!WebPConfigInit(&config) || !WebPPictureInit(&frame)) {
+    fprintf(stderr, "Error! Version mismatch!\n");
+    return -1;
+  }
+  config.lossless = 1;  // Use lossless compression by default.
+  config.image_hint = WEBP_HINT_GRAPH;   // always low-color
+
+  if (argc == 1) {
+    Help();
+    return 0;
+  }
+
+  for (c = 1; c < argc; ++c) {
+    int parse_error = 0;
+    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      Help();
+      return 0;
+    } else if (!strcmp(argv[c], "-o") && c < argc - 1) {
+      out_file = argv[++c];
+    } else if (!strcmp(argv[c], "-lossy")) {
+      config.lossless = 0;
+    } else if (!strcmp(argv[c], "-mixed")) {
+      allow_mixed = 1;
+      config.lossless = 0;
+    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
+      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
+    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
+      config.method = ExUtilGetInt(argv[++c], 0, &parse_error);
+    } else if (!strcmp(argv[c], "-kmax") && c < argc - 1) {
+      kmax = ExUtilGetUInt(argv[++c], 0, &parse_error);
+      default_kmax = 0;
+    } else if (!strcmp(argv[c], "-kmin") && c < argc - 1) {
+      kmin = ExUtilGetUInt(argv[++c], 0, &parse_error);
+      default_kmin = 0;
+    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
+      config.filter_strength = ExUtilGetInt(argv[++c], 0, &parse_error);
+    } else if (!strcmp(argv[c], "-metadata") && c < argc - 1) {
+      static const struct {
+        const char* option;
+        int flag;
+      } kTokens[] = {
+        { "all",  METADATA_ALL },
+        { "none", 0 },
+        { "icc",  METADATA_ICC },
+        { "xmp",  METADATA_XMP },
+      };
+      const size_t kNumTokens = sizeof(kTokens) / sizeof(*kTokens);
+      const char* start = argv[++c];
+      const char* const end = start + strlen(start);
+
+      keep_metadata = 0;
+      while (start < end) {
+        size_t i;
+        const char* token = strchr(start, ',');
+        if (token == NULL) token = end;
+
+        for (i = 0; i < kNumTokens; ++i) {
+          if ((size_t)(token - start) == strlen(kTokens[i].option) &&
+              !strncmp(start, kTokens[i].option, strlen(kTokens[i].option))) {
+            if (kTokens[i].flag != 0) {
+              keep_metadata |= kTokens[i].flag;
+            } else {
+              keep_metadata = 0;
+            }
+            break;
+          }
+        }
+        if (i == kNumTokens) {
+          fprintf(stderr, "Error! Unknown metadata type '%.*s'\n",
+                  (int)(token - start), start);
+          Help();
+          return -1;
+        }
+        start = token + 1;
+      }
+    } else if (!strcmp(argv[c], "-mt")) {
+      ++config.thread_level;
+    } else if (!strcmp(argv[c], "-version")) {
+      const int enc_version = WebPGetEncoderVersion();
+      const int mux_version = WebPGetMuxVersion();
+      printf("WebP Encoder version: %d.%d.%d\nWebP Mux version: %d.%d.%d\n",
+             (enc_version >> 16) & 0xff, (enc_version >> 8) & 0xff,
+             enc_version & 0xff, (mux_version >> 16) & 0xff,
+             (mux_version >> 8) & 0xff, mux_version & 0xff);
+      return 0;
+    } else if (!strcmp(argv[c], "-quiet")) {
+      quiet = 1;
+    } else if (!strcmp(argv[c], "-v")) {
+      verbose = 1;
+    } else if (!strcmp(argv[c], "--")) {
+      if (c < argc - 1) in_file = argv[++c];
+      break;
+    } else if (argv[c][0] == '-') {
+      fprintf(stderr, "Error! Unknown option '%s'\n", argv[c]);
+      Help();
+      return -1;
+    } else {
+      in_file = argv[c];
+    }
+
+    if (parse_error) {
+      Help();
+      return -1;
+    }
+  }
+
+  // Appropriate default kmin, kmax values for lossy and lossless.
+  if (default_kmin) {
+    kmin = config.lossless ? 9 : 3;
+  }
+  if (default_kmax) {
+    kmax = config.lossless ? 17 : 5;
+  }
+  SanitizeKeyFrameIntervals(&kmin, &kmax);
+
+  if (!WebPValidateConfig(&config)) {
+    fprintf(stderr, "Error! Invalid configuration.\n");
+    goto End;
+  }
+
+  if (in_file == NULL) {
+    fprintf(stderr, "No input file specified!\n");
+    Help();
+    goto End;
+  }
+
+  // Start the decoder object
+#if LOCAL_GIF_PREREQ(5,0)
+  gif = DGifOpenFileName(in_file, &gif_error);
+#else
+  gif = DGifOpenFileName(in_file);
+#endif
+  if (gif == NULL) goto End;
+
+  mux = WebPMuxNew();
+  if (mux == NULL) {
+    fprintf(stderr, "ERROR: could not create a mux object.\n");
+    goto End;
+  }
+
+  // Loop over GIF images
+  done = 0;
+  do {
+    GifRecordType type;
+    if (DGifGetRecordType(gif, &type) == GIF_ERROR) goto End;
+
+    switch (type) {
+      case IMAGE_DESC_RECORD_TYPE: {
+        WebPFrameRect gif_rect;
+        GifImageDesc* const image_desc = &gif->Image;
+
+        if (!DGifGetImageDesc(gif)) goto End;
+
+        // Fix some broken GIF global headers that report
+        // 0 x 0 screen dimension.
+        if (is_first_frame) {
+          if (verbose) {
+            printf("Canvas screen: %d x %d\n", gif->SWidth, gif->SHeight);
+          }
+          if (gif->SWidth == 0 || gif->SHeight == 0) {
+            image_desc->Left = 0;
+            image_desc->Top = 0;
+            gif->SWidth = image_desc->Width;
+            gif->SHeight = image_desc->Height;
+            if (gif->SWidth <= 0 || gif->SHeight <= 0) {
+              goto End;
+            }
+            if (verbose) {
+              printf("Fixed canvas screen dimension to: %d x %d\n",
+                     gif->SWidth, gif->SHeight);
+            }
+          }
+#if WEBP_MUX_ABI_VERSION > 0x0101
+          // Set definitive canvas size.
+          err = WebPMuxSetCanvasSize(mux, gif->SWidth, gif->SHeight);
+          if (err != WEBP_MUX_OK) {
+            fprintf(stderr, "Invalid canvas size %d x %d\n",
+                    gif->SWidth, gif->SHeight);
+            goto End;
+          }
+#endif
+          // Allocate current buffer.
+          frame.width = gif->SWidth;
+          frame.height = gif->SHeight;
+          frame.use_argb = 1;
+          if (!WebPPictureAlloc(&frame)) goto End;
+          WebPUtilClearPic(&frame, NULL);
+
+          // Initialize cache.
+          cache = WebPFrameCacheNew(frame.width, frame.height,
+                                    kmin, kmax, allow_mixed);
+          if (cache == NULL) goto End;
+
+          // Background color.
+          GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
+                             &anim.bgcolor);
+        }
+        // Some even more broken GIF can have sub-rect with zero width/height.
+        if (image_desc->Width == 0 || image_desc->Height == 0) {
+          image_desc->Width = gif->SWidth;
+          image_desc->Height = gif->SHeight;
+        }
+
+        if (!ReadFrame(gif, &gif_rect, &frame)) {
+          goto End;
+        }
+
+        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, &frame, &info)) {
+          fprintf(stderr, "Error! Cannot encode frame as WebP\n");
+          fprintf(stderr, "Error code: %d\n", frame.error_code);
+        }
+
+        err = WebPFrameCacheFlush(cache, verbose, mux);
+        if (err != WEBP_MUX_OK) {
+          fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
+                  ErrorString(err));
+          goto End;
+        }
+        is_first_frame = 0;
+
+        // In GIF, graphic control extensions are optional for a frame, so we
+        // may not get one before reading the next frame. To handle this case,
+        // we reset frame properties to reasonable defaults for the next frame.
+        ResetFrameInfo(&info);
+        break;
+      }
+      case EXTENSION_RECORD_TYPE: {
+        int extension;
+        GifByteType *data = NULL;
+        if (DGifGetExtension(gif, &extension, &data) == GIF_ERROR) {
+          goto End;
+        }
+        switch (extension) {
+          case COMMENT_EXT_FUNC_CODE: {
+            break;  // Do nothing for now.
+          }
+          case GRAPHICS_EXT_FUNC_CODE: {
+            const int flags = data[1];
+            const int dispose = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
+            const int delay = data[2] | (data[3] << 8);  // In 10 ms units.
+            if (data[0] != 4) goto End;
+            info.duration = delay * 10;  // Duration is in 1 ms units for WebP.
+            if (dispose == 3) {
+              static int warning_printed = 0;
+              if (!warning_printed) {
+                fprintf(stderr, "WARNING: GIF_DISPOSE_RESTORE unsupported.\n");
+                warning_printed = 1;
+              }
+              // failsafe. TODO(urvang): emulate the correct behaviour by
+              // recoding the whole frame.
+              info.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+            } else {
+              info.dispose_method =
+                  (dispose == 2) ? WEBP_MUX_DISPOSE_BACKGROUND
+                                 : WEBP_MUX_DISPOSE_NONE;
+            }
+            transparent_index = (flags & GIF_TRANSPARENT_MASK) ? data[4] : -1;
+            break;
+          }
+          case PLAINTEXT_EXT_FUNC_CODE: {
+            break;
+          }
+          case APPLICATION_EXT_FUNC_CODE: {
+            if (data[0] != 11) break;    // Chunk is too short
+            if (!memcmp(data + 1, "NETSCAPE2.0", 11) ||
+                !memcmp(data + 1, "ANIMEXTS1.0", 11)) {
+              // Recognize and parse Netscape2.0 NAB extension for loop count.
+              if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
+              if (data == NULL) goto End;  // Loop count sub-block missing.
+              if (data[0] < 3 || data[1] != 1) break;   // wrong size/marker
+              anim.loop_count = data[2] | (data[3] << 8);
+              if (verbose) {
+                fprintf(stderr, "Loop count: %d\n", anim.loop_count);
+              }
+            } else {  // An extension containing metadata.
+              // We only store the first encountered chunk of each type, and
+              // only if requested by the user.
+              const int is_xmp = (keep_metadata & METADATA_XMP) &&
+                                 !stored_xmp &&
+                                 !memcmp(data + 1, "XMP DataXMP", 11);
+              const int is_icc = (keep_metadata & METADATA_ICC) &&
+                                 !stored_icc &&
+                                 !memcmp(data + 1, "ICCRGBG1012", 11);
+              if (is_xmp || is_icc) {
+                const char* const fourccs[2] = { "XMP " , "ICCP" };
+                const char* const features[2] = { "XMP" , "ICC" };
+                WebPData metadata = { NULL, 0 };
+                // Construct metadata from sub-blocks.
+                // Usual case (including ICC profile): In each sub-block, the
+                // first byte specifies its size in bytes (0 to 255) and the
+                // rest of the bytes contain the data.
+                // Special case for XMP data: In each sub-block, the first byte
+                // is also part of the XMP payload. XMP in GIF also has a 257
+                // byte padding data. See the XMP specification for details.
+                while (1) {
+                  WebPData prev_metadata = metadata;
+                  WebPData subblock;
+                  if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) {
+                    WebPDataClear(&metadata);
+                    goto End;
+                  }
+                  if (data == NULL) break;  // Finished.
+                  subblock.size = is_xmp ? data[0] + 1 : data[0];
+                  assert(subblock.size > 0);
+                  subblock.bytes = is_xmp ? data : data + 1;
+                  metadata.bytes =
+                      (uint8_t*)realloc((void*)metadata.bytes,
+                                        prev_metadata.size + subblock.size);
+                  if (metadata.bytes == NULL) {
+                    WebPDataClear(&prev_metadata);
+                    goto End;
+                  }
+                  metadata.size += subblock.size;
+                  memcpy((void*)(metadata.bytes + prev_metadata.size),
+                         subblock.bytes, subblock.size);
+                }
+                if (is_xmp) {
+                  // XMP padding data is 0x01, 0xff, 0xfe ... 0x01, 0x00.
+                  const size_t xmp_pading_size = 257;
+                  if (metadata.size > xmp_pading_size) {
+                    metadata.size -= xmp_pading_size;
+                  }
+                }
+
+                // Add metadata chunk.
+                err = WebPMuxSetChunk(mux, fourccs[is_icc], &metadata, 1);
+                if (verbose) {
+                  fprintf(stderr, "%s size: %d\n",
+                          features[is_icc], (int)metadata.size);
+                }
+                WebPDataClear(&metadata);
+                if (err != WEBP_MUX_OK) {
+                  fprintf(stderr, "ERROR (%s): Could not set %s chunk.\n",
+                          ErrorString(err), features[is_icc]);
+                  goto End;
+                }
+                if (is_icc) {
+                  stored_icc = 1;
+                } else if (is_xmp) {
+                  stored_xmp = 1;
+                }
+              }
+            }
+            break;
+          }
+          default: {
+            break;  // skip
+          }
+        }
+        while (data != NULL) {
+          if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
+        }
+        break;
+      }
+      case TERMINATE_RECORD_TYPE: {
+        done = 1;
+        break;
+      }
+      default: {
+        if (verbose) {
+          fprintf(stderr, "Skipping over unknown record type %d\n", type);
+        }
+        break;
+      }
+    }
+  } while (!done);
+
+  // Flush any pending frames.
+  err = WebPFrameCacheFlushAll(cache, verbose, mux);
+  if (err != WEBP_MUX_OK) {
+    fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
+            ErrorString(err));
+    goto End;
+  }
+
+  // Finish muxing
+  err = WebPMuxSetAnimationParams(mux, &anim);
+  if (err != WEBP_MUX_OK) {
+    fprintf(stderr, "ERROR (%s): Could not set animation parameters.\n",
+            ErrorString(err));
+    goto End;
+  }
+
+  err = WebPMuxAssemble(mux, &webp_data);
+  if (err != WEBP_MUX_OK) {
+    fprintf(stderr, "ERROR (%s) assembling the WebP file.\n", ErrorString(err));
+    goto End;
+  }
+  if (out_file != NULL) {
+    if (!ExUtilWriteFile(out_file, webp_data.bytes, webp_data.size)) {
+      fprintf(stderr, "Error writing output file: %s\n", out_file);
+      goto End;
+    }
+    if (!quiet) {
+      fprintf(stderr, "Saved output file: %s\n", out_file);
+    }
+  } else {
+    if (!quiet) {
+      fprintf(stderr, "Nothing written; use -o flag to save the result.\n");
+    }
+  }
+
+  // All OK.
+  ok = 1;
+  gif_error = GIF_OK;
+
+ End:
+  WebPDataClear(&webp_data);
+  WebPMuxDelete(mux);
+  WebPPictureFree(&frame);
+  WebPFrameCacheDelete(cache);
+  if (out != NULL && out_file != NULL) fclose(out);
+
+  if (gif_error != GIF_OK) {
+    DisplayGifError(gif, gif_error);
+  }
+  if (gif != NULL) {
+#if LOCAL_GIF_PREREQ(5,1)
+    DGifCloseFile(gif, &gif_error);
+#else
+    DGifCloseFile(gif);
+#endif
+  }
+
+  return !ok;
+}
+
+#else  // !WEBP_HAVE_GIF
+
+int main(int argc, const char *argv[]) {
+  fprintf(stderr, "GIF support not enabled in %s.\n", argv[0]);
+  (void)argc;
+  return 0;
+}
+
+#endif
+
+//------------------------------------------------------------------------------
--- a/examples/gif2webp_util.c
+++ b/examples/gif2webp_util.c
@ -0,0 +1,708 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Helper structs and methods for gif2webp tool.
+//
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "webp/encode.h"
+#include "./gif2webp_util.h"
+
+#define DELTA_INFINITY      1ULL << 32
+#define KEYFRAME_NONE       -1
+
+//------------------------------------------------------------------------------
+// Helper utilities.
+
+static void ClearRectangle(WebPPicture* const picture,
+                           int left, int top, int width, int height) {
+  int j;
+  for (j = top; j < top + height; ++j) {
+    uint32_t* const dst = picture->argb + j * picture->argb_stride;
+    int i;
+    for (i = left; i < left + width; ++i) {
+      dst[i] = WEBP_UTIL_TRANSPARENT_COLOR;
+    }
+  }
+}
+
+void WebPUtilClearPic(WebPPicture* const picture,
+                      const WebPFrameRect* const rect) {
+  if (rect != NULL) {
+    ClearRectangle(picture, rect->x_offset, rect->y_offset,
+                   rect->width, rect->height);
+  } else {
+    ClearRectangle(picture, 0, 0, picture->width, picture->height);
+  }
+}
+
+// TODO: Also used in picture.c. Move to a common location?
+// Copy width x height pixels from 'src' to 'dst' honoring the strides.
+static void CopyPlane(const uint8_t* src, int src_stride,
+                      uint8_t* dst, int dst_stride, int width, int height) {
+  while (height-- > 0) {
+    memcpy(dst, src, width);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// Copy pixels from 'src' to 'dst' honoring strides. 'src' and 'dst' are assumed
+// to be already allocated.
+static void CopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
+  assert(src->width == dst->width && src->height == dst->height);
+  CopyPlane((uint8_t*)src->argb, 4 * src->argb_stride, (uint8_t*)dst->argb,
+            4 * dst->argb_stride, 4 * src->width, src->height);
+}
+
+// Given 'src' picture and its frame rectangle 'rect', blend it into 'dst'.
+static void BlendPixels(const WebPPicture* const src,
+                        const WebPFrameRect* const rect,
+                        WebPPicture* const dst) {
+  int j;
+  assert(src->width == dst->width && src->height == dst->height);
+  for (j = rect->y_offset; j < rect->y_offset + rect->height; ++j) {
+    int i;
+    for (i = rect->x_offset; i < rect->x_offset + rect->width; ++i) {
+      const uint32_t src_pixel = src->argb[j * src->argb_stride + i];
+      const int src_alpha = src_pixel >> 24;
+      if (src_alpha != 0) {
+        dst->argb[j * dst->argb_stride + i] = src_pixel;
+      }
+    }
+  }
+}
+
+// Replace transparent pixels within 'dst_rect' of 'dst' by those in the 'src'.
+static void ReduceTransparency(const WebPPicture* const src,
+                               const WebPFrameRect* const rect,
+                               WebPPicture* const dst) {
+  int i, j;
+  assert(src != NULL && dst != NULL && rect != NULL);
+  assert(src->width == dst->width && src->height == dst->height);
+  for (j = rect->y_offset; j < rect->y_offset + rect->height; ++j) {
+    for (i = rect->x_offset; i < rect->x_offset + rect->width; ++i) {
+      const uint32_t src_pixel = src->argb[j * src->argb_stride + i];
+      const int src_alpha = src_pixel >> 24;
+      const uint32_t dst_pixel = dst->argb[j * dst->argb_stride + i];
+      const int dst_alpha = dst_pixel >> 24;
+      if (dst_alpha == 0 && src_alpha == 0xff) {
+        dst->argb[j * dst->argb_stride + i] = src_pixel;
+      }
+    }
+  }
+}
+
+// Replace similar blocks of pixels by a 'see-through' transparent block
+// with uniform average color.
+static void FlattenSimilarBlocks(const WebPPicture* const src,
+                                 const WebPFrameRect* const rect,
+                                 WebPPicture* const dst) {
+  int i, j;
+  const int block_size = 8;
+  const int y_start = (rect->y_offset + block_size) & ~(block_size - 1);
+  const int y_end = (rect->y_offset + rect->height) & ~(block_size - 1);
+  const int x_start = (rect->x_offset + block_size) & ~(block_size - 1);
+  const int x_end = (rect->x_offset + rect->width) & ~(block_size - 1);
+  assert(src != NULL && dst != NULL && rect != NULL);
+  assert(src->width == dst->width && src->height == dst->height);
+  assert((block_size & (block_size - 1)) == 0);  // must be a power of 2
+  // Iterate over each block and count similar pixels.
+  for (j = y_start; j < y_end; j += block_size) {
+    for (i = x_start; i < x_end; i += block_size) {
+      int cnt = 0;
+      int avg_r = 0, avg_g = 0, avg_b = 0;
+      int x, y;
+      const uint32_t* const psrc = src->argb + j * src->argb_stride + i;
+      uint32_t* const pdst = dst->argb + j * dst->argb_stride + i;
+      for (y = 0; y < block_size; ++y) {
+        for (x = 0; x < block_size; ++x) {
+          const uint32_t src_pixel = psrc[x + y * src->argb_stride];
+          const int alpha = src_pixel >> 24;
+          if (alpha == 0xff &&
+              src_pixel == pdst[x + y * dst->argb_stride]) {
+              ++cnt;
+              avg_r += (src_pixel >> 16) & 0xff;
+              avg_g += (src_pixel >>  8) & 0xff;
+              avg_b += (src_pixel >>  0) & 0xff;
+          }
+        }
+      }
+      // If we have a fully similar block, we replace it with an
+      // average transparent block. This compresses better in lossy mode.
+      if (cnt == block_size * block_size) {
+        const uint32_t color = (0x00          << 24) |
+                               ((avg_r / cnt) << 16) |
+                               ((avg_g / cnt) <<  8) |
+                               ((avg_b / cnt) <<  0);
+        for (y = 0; y < block_size; ++y) {
+          for (x = 0; x < block_size; ++x) {
+            pdst[x + y * dst->argb_stride] = color;
+          }
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Key frame related utilities.
+
+// Returns true if 'curr' frame with frame rectangle 'curr_rect' is a key frame,
+// that is, it can be decoded independently of 'prev' canvas.
+static int IsKeyFrame(const WebPPicture* const curr,
+                      const WebPFrameRect* const curr_rect,
+                      const WebPPicture* const prev) {
+  int i, j;
+  int is_key_frame = 1;
+
+  // If previous canvas (with previous frame disposed) is all transparent,
+  // current frame is a key frame.
+  for (j = 0; j < prev->height; ++j) {
+    const uint32_t* const row = &prev->argb[j * prev->argb_stride];
+    for (i = 0; i < prev->width; ++i) {
+      if (row[i] & 0xff000000u) {   // has alpha?
+        is_key_frame = 0;
+        break;
+      }
+    }
+    if (!is_key_frame) break;
+  }
+  if (is_key_frame) return 1;
+
+  // If current frame covers the whole canvas and does not contain any
+  // transparent pixels that depend on previous canvas, then current frame is
+  // a key frame.
+  if (curr_rect->width == curr->width && curr_rect->height == curr->height) {
+    assert(curr_rect->x_offset == 0 && curr_rect->y_offset == 0);
+    is_key_frame = 1;
+    for (j = 0; j < prev->height; ++j) {
+      for (i = 0; i < prev->width; ++i) {
+        const uint32_t prev_alpha =
+            (prev->argb[j * prev->argb_stride + i]) >> 24;
+        const uint32_t curr_alpha =
+            (curr->argb[j * curr->argb_stride + i]) >> 24;
+        if (curr_alpha != 0xff && prev_alpha != 0) {
+          is_key_frame = 0;
+          break;
+        }
+      }
+      if (!is_key_frame) break;
+    }
+    if (is_key_frame) return 1;
+  }
+
+  return 0;
+}
+
+// Given 'prev' frame and current frame rectangle 'rect', convert 'curr' frame
+// to a key frame.
+static void ConvertToKeyFrame(const WebPPicture* const prev,
+                              WebPFrameRect* const rect,
+                              WebPPicture* const curr) {
+  int j;
+  assert(curr->width == prev->width && curr->height == prev->height);
+
+  // Replace transparent pixels of current canvas with those from previous
+  // canvas (with previous frame disposed).
+  for (j = 0; j < curr->height; ++j) {
+    int i;
+    for (i = 0; i < curr->width; ++i) {
+      uint32_t* const curr_pixel = curr->argb + j * curr->argb_stride + i;
+      const int curr_alpha = *curr_pixel >> 24;
+      if (curr_alpha == 0) {
+        *curr_pixel = prev->argb[j * prev->argb_stride + i];
+      }
+    }
+  }
+
+  // Frame rectangle now covers the whole canvas.
+  rect->x_offset = 0;
+  rect->y_offset = 0;
+  rect->width = curr->width;
+  rect->height = curr->height;
+}
+
+//------------------------------------------------------------------------------
+// Encoded frame.
+
+// Used to store two candidates of encoded data for an animation frame. One of
+// the two will be chosen later.
+typedef struct {
+  WebPMuxFrameInfo sub_frame;  // Encoded frame rectangle.
+  WebPMuxFrameInfo key_frame;  // Encoded frame if it was converted to keyframe.
+} EncodedFrame;
+
+// Release the data contained by 'encoded_frame'.
+static void FrameRelease(EncodedFrame* const encoded_frame) {
+  if (encoded_frame != NULL) {
+    WebPDataClear(&encoded_frame->sub_frame.bitstream);
+    WebPDataClear(&encoded_frame->key_frame.bitstream);
+    memset(encoded_frame, 0, sizeof(*encoded_frame));
+  }
+}
+
+//------------------------------------------------------------------------------
+// Frame cache.
+
+// Used to store encoded frames that haven't been output yet.
+struct WebPFrameCache {
+  EncodedFrame* encoded_frames;  // Array of encoded frames.
+  size_t size;               // Number of allocated data elements.
+  size_t start;              // Start index.
+  size_t count;              // Number of valid data elements.
+  int flush_count;           // If >0, ‘flush_count’ frames starting from
+                             // 'start' are ready to be added to mux.
+  int64_t best_delta;        // min(canvas size - frame size) over the frames.
+                             // Can be negative in certain cases due to
+                             // transparent pixels in a frame.
+  int keyframe;              // Index of selected keyframe relative to 'start'.
+
+  size_t kmin;                   // Min distance between key frames.
+  size_t kmax;                   // Max distance between key frames.
+  size_t count_since_key_frame;  // Frames seen since the last key frame.
+  int allow_mixed;           // If true, each frame can be lossy or lossless.
+  WebPPicture prev_canvas;   // Previous canvas (properly disposed).
+  WebPPicture curr_canvas;   // Current canvas (temporary buffer).
+  int is_first_frame;        // True if no frames have been added to the cache
+                             // since WebPFrameCacheNew().
+};
+
+// Reset the counters in the cache struct. Doesn't touch 'cache->encoded_frames'
+// and 'cache->size'.
+static void CacheReset(WebPFrameCache* const cache) {
+  cache->start = 0;
+  cache->count = 0;
+  cache->flush_count = 0;
+  cache->best_delta = DELTA_INFINITY;
+  cache->keyframe = KEYFRAME_NONE;
+}
+
+WebPFrameCache* WebPFrameCacheNew(int width, int height,
+                                  size_t kmin, size_t kmax, int allow_mixed) {
+  WebPFrameCache* cache = (WebPFrameCache*)malloc(sizeof(*cache));
+  if (cache == NULL) return NULL;
+  CacheReset(cache);
+  // sanity init, so we can call WebPFrameCacheDelete():
+  cache->encoded_frames = NULL;
+
+  cache->is_first_frame = 1;
+
+  // Picture buffers.
+  if (!WebPPictureInit(&cache->prev_canvas) ||
+      !WebPPictureInit(&cache->curr_canvas)) {
+    return NULL;
+  }
+  cache->prev_canvas.width = width;
+  cache->prev_canvas.height = height;
+  cache->prev_canvas.use_argb = 1;
+  if (!WebPPictureAlloc(&cache->prev_canvas) ||
+      !WebPPictureCopy(&cache->prev_canvas, &cache->curr_canvas)) {
+    goto Err;
+  }
+  WebPUtilClearPic(&cache->prev_canvas, NULL);
+
+  // Cache data.
+  cache->allow_mixed = allow_mixed;
+  cache->kmin = kmin;
+  cache->kmax = kmax;
+  cache->count_since_key_frame = 0;
+  assert(kmax > kmin);
+  cache->size = kmax - kmin;
+  cache->encoded_frames =
+      (EncodedFrame*)calloc(cache->size, sizeof(*cache->encoded_frames));
+  if (cache->encoded_frames == NULL) goto Err;
+
+  return cache;  // All OK.
+
+ Err:
+  WebPFrameCacheDelete(cache);
+  return NULL;
+}
+
+void WebPFrameCacheDelete(WebPFrameCache* const cache) {
+  if (cache != NULL) {
+    if (cache->encoded_frames != NULL) {
+      size_t i;
+      for (i = 0; i < cache->size; ++i) {
+        FrameRelease(&cache->encoded_frames[i]);
+      }
+      free(cache->encoded_frames);
+    }
+    WebPPictureFree(&cache->prev_canvas);
+    WebPPictureFree(&cache->curr_canvas);
+    free(cache);
+  }
+}
+
+static int EncodeFrame(const WebPConfig* const config, WebPPicture* const pic,
+                       WebPMemoryWriter* const memory) {
+  pic->use_argb = 1;
+  pic->writer = WebPMemoryWrite;
+  pic->custom_ptr = memory;
+  if (!WebPEncode(config, pic)) {
+    return 0;
+  }
+  return 1;
+}
+
+static void GetEncodedData(const WebPMemoryWriter* const memory,
+                           WebPData* const encoded_data) {
+  encoded_data->bytes = memory->mem;
+  encoded_data->size  = memory->size;
+}
+
+#define MIN_COLORS_LOSSY     31  // Don't try lossy below this threshold.
+#define MAX_COLORS_LOSSLESS 194  // Don't try lossless above this threshold.
+#define MAX_COLOR_COUNT     256  // Power of 2 greater than MAX_COLORS_LOSSLESS.
+#define HASH_SIZE (MAX_COLOR_COUNT * 4)
+#define HASH_RIGHT_SHIFT     22  // 32 - log2(HASH_SIZE).
+
+// TODO(urvang): Also used in enc/vp8l.c. Move to utils.
+// If the number of colors in the 'pic' is at least MAX_COLOR_COUNT, return
+// MAX_COLOR_COUNT. Otherwise, return the exact number of colors in the 'pic'.
+static int GetColorCount(const WebPPicture* const pic) {
+  int x, y;
+  int num_colors = 0;
+  uint8_t in_use[HASH_SIZE] = { 0 };
+  uint32_t colors[HASH_SIZE];
+  static const uint32_t kHashMul = 0x1e35a7bd;
+  const uint32_t* argb = pic->argb;
+  const int width = pic->width;
+  const int height = pic->height;
+  uint32_t last_pix = ~argb[0];   // so we're sure that last_pix != argb[0]
+
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      int key;
+      if (argb[x] == last_pix) {
+        continue;
+      }
+      last_pix = argb[x];
+      key = (kHashMul * last_pix) >> HASH_RIGHT_SHIFT;
+      while (1) {
+        if (!in_use[key]) {
+          colors[key] = last_pix;
+          in_use[key] = 1;
+          ++num_colors;
+          if (num_colors >= MAX_COLOR_COUNT) {
+            return MAX_COLOR_COUNT;  // Exact count not needed.
+          }
+          break;
+        } else if (colors[key] == last_pix) {
+          break;  // The color is already there.
+        } else {
+          // Some other color sits here, so do linear conflict resolution.
+          ++key;
+          key &= (HASH_SIZE - 1);  // Key mask.
+        }
+      }
+    }
+    argb += pic->argb_stride;
+  }
+  return num_colors;
+}
+
+#undef MAX_COLOR_COUNT
+#undef HASH_SIZE
+#undef HASH_RIGHT_SHIFT
+
+static WebPEncodingError SetFrame(const WebPConfig* const config,
+                                  int allow_mixed, int is_key_frame,
+                                  const WebPPicture* const prev_canvas,
+                                  WebPPicture* const frame,
+                                  const WebPFrameRect* const rect,
+                                  const WebPMuxFrameInfo* const info,
+                                  WebPPicture* const sub_frame,
+                                  EncodedFrame* encoded_frame) {
+  WebPEncodingError error_code = VP8_ENC_OK;
+  int try_lossless;
+  int try_lossy;
+  int try_both;
+  WebPMemoryWriter mem1, mem2;
+  WebPData* encoded_data;
+  WebPMuxFrameInfo* const dst =
+      is_key_frame ? &encoded_frame->key_frame : &encoded_frame->sub_frame;
+  *dst = *info;
+  encoded_data = &dst->bitstream;
+  WebPMemoryWriterInit(&mem1);
+  WebPMemoryWriterInit(&mem2);
+
+  if (!allow_mixed) {
+    try_lossless = config->lossless;
+    try_lossy = !try_lossless;
+  } else {  // Use a heuristic for trying lossless and/or lossy compression.
+    const int num_colors = GetColorCount(sub_frame);
+    try_lossless = (num_colors < MAX_COLORS_LOSSLESS);
+    try_lossy = (num_colors >= MIN_COLORS_LOSSY);
+  }
+  try_both = try_lossless && try_lossy;
+
+  if (try_lossless) {
+    WebPConfig config_ll = *config;
+    config_ll.lossless = 1;
+    if (!EncodeFrame(&config_ll, sub_frame, &mem1)) {
+      error_code = sub_frame->error_code;
+      goto Err;
+    }
+  }
+
+  if (try_lossy) {
+    WebPConfig config_lossy = *config;
+    config_lossy.lossless = 0;
+    if (!is_key_frame) {
+      // For lossy compression of a frame, it's better to replace transparent
+      // pixels of 'curr' with actual RGB values, whenever possible.
+      ReduceTransparency(prev_canvas, rect, frame);
+      // TODO(later): Investigate if this helps lossless compression as well.
+      FlattenSimilarBlocks(prev_canvas, rect, frame);
+    }
+    if (!EncodeFrame(&config_lossy, sub_frame, &mem2)) {
+      error_code = sub_frame->error_code;
+      goto Err;
+    }
+  }
+
+  if (try_both) {  // Pick the encoding with smallest size.
+    // TODO(later): Perhaps a rough SSIM/PSNR produced by the encoder should
+    // also be a criteria, in addition to sizes.
+    if (mem1.size <= mem2.size) {
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
+      WebPMemoryWriterClear(&mem2);
+#else
+      free(mem2.mem);
+      memset(&mem2, 0, sizeof(mem2));
+#endif
+      GetEncodedData(&mem1, encoded_data);
+    } else {
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
+      WebPMemoryWriterClear(&mem1);
+#else
+      free(mem1.mem);
+      memset(&mem1, 0, sizeof(mem1));
+#endif
+      GetEncodedData(&mem2, encoded_data);
+    }
+  } else {
+    GetEncodedData(try_lossless ? &mem1 : &mem2, encoded_data);
+  }
+  return error_code;
+
+ Err:
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
+  WebPMemoryWriterClear(&mem1);
+  WebPMemoryWriterClear(&mem2);
+#else
+  free(mem1.mem);
+  free(mem2.mem);
+#endif
+  return error_code;
+}
+
+#undef MIN_COLORS_LOSSY
+#undef MAX_COLORS_LOSSLESS
+
+// Returns cached frame at given 'position' index.
+static EncodedFrame* CacheGetFrame(const WebPFrameCache* const cache,
+                                   size_t position) {
+  assert(cache->start + position < cache->size);
+  return &cache->encoded_frames[cache->start + position];
+}
+
+// Calculate the penalty incurred if we encode given frame as a key frame
+// instead of a sub-frame.
+static int64_t KeyFramePenalty(const EncodedFrame* const encoded_frame) {
+  return ((int64_t)encoded_frame->key_frame.bitstream.size -
+          encoded_frame->sub_frame.bitstream.size);
+}
+
+static void DisposeFrame(WebPMuxAnimDispose dispose_method,
+                         const WebPFrameRect* const gif_rect,
+                         WebPPicture* const frame, WebPPicture* const canvas) {
+  if (dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+    WebPUtilClearPic(frame, NULL);
+    WebPUtilClearPic(canvas, gif_rect);
+  }
+}
+
+int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
+                           const WebPConfig* const config,
+                           const WebPFrameRect* const orig_rect_ptr,
+                           WebPPicture* const frame,
+                           WebPMuxFrameInfo* const info) {
+  int ok = 0;
+  WebPEncodingError error_code = VP8_ENC_OK;
+  WebPFrameRect rect;
+  WebPPicture sub_image;  // View extracted from 'frame' with rectangle 'rect'.
+  WebPPicture* const prev_canvas = &cache->prev_canvas;
+  const size_t position = cache->count;
+  const int allow_mixed = cache->allow_mixed;
+  EncodedFrame* const encoded_frame = CacheGetFrame(cache, position);
+  WebPFrameRect orig_rect;
+  assert(position < cache->size);
+
+  if (frame == NULL || info == NULL) {
+    return 0;
+  }
+
+  if (orig_rect_ptr == NULL) {
+    orig_rect.width = frame->width;
+    orig_rect.height = frame->height;
+    orig_rect.x_offset = 0;
+    orig_rect.y_offset = 0;
+  } else {
+    orig_rect = *orig_rect_ptr;
+  }
+
+  // Snap to even offsets (and adjust dimensions if needed).
+  rect = orig_rect;
+  rect.width += (rect.x_offset & 1);
+  rect.height += (rect.y_offset & 1);
+  rect.x_offset &= ~1;
+  rect.y_offset &= ~1;
+
+  if (!WebPPictureView(frame, rect.x_offset, rect.y_offset,
+                       rect.width, rect.height, &sub_image)) {
+    return 0;
+  }
+  info->x_offset = rect.x_offset;
+  info->y_offset = rect.y_offset;
+
+  ++cache->count;
+
+  if (cache->is_first_frame || IsKeyFrame(frame, &rect, prev_canvas)) {
+    // Add this as a key frame.
+    error_code = SetFrame(config, allow_mixed, 1, NULL, NULL, NULL,
+                          info, &sub_image, encoded_frame);
+    if (error_code != VP8_ENC_OK) {
+      goto End;
+    }
+    cache->keyframe = position;
+    cache->flush_count = cache->count;
+    cache->count_since_key_frame = 0;
+    // Update prev_canvas by simply copying from 'curr'.
+    CopyPixels(frame, prev_canvas);
+  } else {
+    ++cache->count_since_key_frame;
+    if (cache->count_since_key_frame <= cache->kmin) {
+      // Add this as a frame rectangle.
+      error_code = SetFrame(config, allow_mixed, 0, prev_canvas, frame,
+                            &rect, info, &sub_image, encoded_frame);
+      if (error_code != VP8_ENC_OK) {
+        goto End;
+      }
+      cache->flush_count = cache->count;
+      // Update prev_canvas by blending 'curr' into it.
+      BlendPixels(frame, &orig_rect, prev_canvas);
+    } else {
+      WebPPicture full_image;
+      WebPMuxFrameInfo full_image_info;
+      int64_t curr_delta;
+
+      // Add frame rectangle to cache.
+      error_code = SetFrame(config, allow_mixed, 0, prev_canvas, frame, &rect,
+                            info, &sub_image, encoded_frame);
+      if (error_code != VP8_ENC_OK) {
+        goto End;
+      }
+
+      // Convert to a key frame.
+      CopyPixels(frame, &cache->curr_canvas);
+      ConvertToKeyFrame(prev_canvas, &rect, &cache->curr_canvas);
+      if (!WebPPictureView(&cache->curr_canvas, rect.x_offset, rect.y_offset,
+                           rect.width, rect.height, &full_image)) {
+        goto End;
+      }
+      full_image_info = *info;
+      full_image_info.x_offset = rect.x_offset;
+      full_image_info.y_offset = rect.y_offset;
+
+      // Add key frame to cache, too.
+      error_code = SetFrame(config, allow_mixed, 1, NULL, NULL, NULL,
+                            &full_image_info, &full_image, encoded_frame);
+      WebPPictureFree(&full_image);
+      if (error_code != VP8_ENC_OK) goto End;
+
+      // Analyze size difference of the two variants.
+      curr_delta = KeyFramePenalty(encoded_frame);
+      if (curr_delta <= cache->best_delta) {  // Pick this as keyframe.
+        cache->keyframe = position;
+        cache->best_delta = curr_delta;
+        cache->flush_count = cache->count - 1;  // We can flush previous frames.
+      }
+      if (cache->count_since_key_frame == cache->kmax) {
+        cache->flush_count = cache->count;
+        cache->count_since_key_frame = 0;
+      }
+
+      // Update prev_canvas by simply copying from 'curr_canvas'.
+      CopyPixels(&cache->curr_canvas, prev_canvas);
+    }
+  }
+
+  DisposeFrame(info->dispose_method, &orig_rect, frame, prev_canvas);
+
+  cache->is_first_frame = 0;
+  ok = 1;
+
+ End:
+  WebPPictureFree(&sub_image);
+  if (!ok) {
+    FrameRelease(encoded_frame);
+    --cache->count;  // We reset the count, as the frame addition failed.
+  }
+  frame->error_code = error_code;   // report error_code
+  assert(ok || error_code != VP8_ENC_OK);
+  return ok;
+}
+
+WebPMuxError WebPFrameCacheFlush(WebPFrameCache* const cache, int verbose,
+                                 WebPMux* const mux) {
+  while (cache->flush_count > 0) {
+    WebPMuxFrameInfo* info;
+    WebPMuxError err;
+    EncodedFrame* const curr = CacheGetFrame(cache, 0);
+    // Pick frame or full canvas.
+    if (cache->keyframe == 0) {
+      info = &curr->key_frame;
+      info->blend_method = WEBP_MUX_NO_BLEND;
+      cache->keyframe = KEYFRAME_NONE;
+      cache->best_delta = DELTA_INFINITY;
+    } else {
+      info = &curr->sub_frame;
+      info->blend_method = WEBP_MUX_BLEND;
+    }
+    // Add to mux.
+    err = WebPMuxPushFrame(mux, info, 1);
+    if (err != WEBP_MUX_OK) return err;
+    if (verbose) {
+      printf("Added frame. offset:%d,%d duration:%d dispose:%d blend:%d\n",
+             info->x_offset, info->y_offset, info->duration,
+             info->dispose_method, info->blend_method);
+    }
+    FrameRelease(curr);
+    ++cache->start;
+    --cache->flush_count;
+    --cache->count;
+    if (cache->keyframe != KEYFRAME_NONE) --cache->keyframe;
+  }
+
+  if (cache->count == 0) CacheReset(cache);
+  return WEBP_MUX_OK;
+}
+
+WebPMuxError WebPFrameCacheFlushAll(WebPFrameCache* const cache, int verbose,
+                                    WebPMux* const mux) {
+  cache->flush_count = cache->count;  // Force flushing of all frames.
+  return WebPFrameCacheFlush(cache, verbose, mux);
+}
+
+//------------------------------------------------------------------------------
--- a/examples/gif2webp_util.h
+++ b/examples/gif2webp_util.h
@ -0,0 +1,81 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Helper structs and methods for gif2webp tool.
+//
+// Author: Urvang (urvang@google.com)
+
+#ifndef WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
+#define WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
+
+#include <stdlib.h>
+
+#include "webp/mux.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Helper utilities.
+
+#define WEBP_UTIL_TRANSPARENT_COLOR 0x00ffffff
+
+struct WebPPicture;
+
+typedef struct {
+  int x_offset, y_offset, width, height;
+} WebPFrameRect;
+
+// Clear pixels in 'picture' within given 'rect' to transparent color.
+void WebPUtilClearPic(struct WebPPicture* const picture,
+                      const WebPFrameRect* const rect);
+
+//------------------------------------------------------------------------------
+// Frame cache.
+
+typedef struct WebPFrameCache WebPFrameCache;
+
+// Given the minimum distance between key frames 'kmin' and maximum distance
+// between key frames 'kmax', returns an appropriately allocated cache object.
+// If 'allow_mixed' is true, the subsequent calls to WebPFrameCacheAddFrame()
+// will heuristically pick lossy or lossless compression for each frame.
+// Use WebPFrameCacheDelete() to deallocate the 'cache'.
+WebPFrameCache* WebPFrameCacheNew(int width, int height,
+                                  size_t kmin, size_t kmax, int allow_mixed);
+
+// Release all the frame data from 'cache' and free 'cache'.
+void WebPFrameCacheDelete(WebPFrameCache* const cache);
+
+// Given an image described by 'frame', 'info' and 'orig_rect', optimize it for
+// WebP, encode it and add it to 'cache'. 'orig_rect' can be NULL.
+// This takes care of frame disposal too, according to 'info->dispose_method'.
+// Returns false in case of error (and sets frame->error_code accordingly).
+int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
+                           const WebPConfig* const config,
+                           const WebPFrameRect* const orig_rect,
+                           WebPPicture* const frame,
+                           WebPMuxFrameInfo* const info);
+
+// Flush the *ready* frames from cache and add them to 'mux'. If 'verbose' is
+// true, prints the information about these frames.
+WebPMuxError WebPFrameCacheFlush(WebPFrameCache* const cache, int verbose,
+                                 WebPMux* const mux);
+
+// Similar to 'WebPFrameCacheFlushFrames()', but flushes *all* the frames.
+WebPMuxError WebPFrameCacheFlushAll(WebPFrameCache* const cache, int verbose,
+                                    WebPMux* const mux);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
--- a/examples/jpegdec.c
+++ b/examples/jpegdec.c
@ -0,0 +1,295 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// JPEG decode.
+
+#include "./jpegdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_JPEG
+#include <jpeglib.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+// -----------------------------------------------------------------------------
+// Metadata processing
+
+#ifndef JPEG_APP1
+# define JPEG_APP1 (JPEG_APP0 + 1)
+#endif
+#ifndef JPEG_APP2
+# define JPEG_APP2 (JPEG_APP0 + 2)
+#endif
+
+typedef struct {
+  const uint8_t* data;
+  size_t data_length;
+  int seq;  // this segment's sequence number [1, 255] for use in reassembly.
+} ICCPSegment;
+
+static void SaveMetadataMarkers(j_decompress_ptr dinfo) {
+  const unsigned int max_marker_length = 0xffff;
+  jpeg_save_markers(dinfo, JPEG_APP1, max_marker_length);  // Exif/XMP
+  jpeg_save_markers(dinfo, JPEG_APP2, max_marker_length);  // ICC profile
+}
+
+static int CompareICCPSegments(const void* a, const void* b) {
+  const ICCPSegment* s1 = (const ICCPSegment*)a;
+  const ICCPSegment* s2 = (const ICCPSegment*)b;
+  return s1->seq - s2->seq;
+}
+
+// Extract ICC profile segments from the marker list in 'dinfo', reassembling
+// and storing them in 'iccp'.
+// Returns true on success and false for memory errors and corrupt profiles.
+static int StoreICCP(j_decompress_ptr dinfo, MetadataPayload* const iccp) {
+  // ICC.1:2010-12 (4.3.0.0) Annex B.4 Embedding ICC Profiles in JPEG files
+  static const char kICCPSignature[] = "ICC_PROFILE";
+  static const size_t kICCPSignatureLength = 12;  // signature includes '\0'
+  static const size_t kICCPSkipLength = 14;  // signature + seq & count
+  int expected_count = 0;
+  int actual_count = 0;
+  int seq_max = 0;
+  size_t total_size = 0;
+  ICCPSegment iccp_segments[255];
+  jpeg_saved_marker_ptr marker;
+
+  memset(iccp_segments, 0, sizeof(iccp_segments));
+  for (marker = dinfo->marker_list; marker != NULL; marker = marker->next) {
+    if (marker->marker == JPEG_APP2 &&
+        marker->data_length > kICCPSkipLength &&
+        !memcmp(marker->data, kICCPSignature, kICCPSignatureLength)) {
+      // ICC_PROFILE\0<seq><count>; 'seq' starts at 1.
+      const int seq = marker->data[kICCPSignatureLength];
+      const int count = marker->data[kICCPSignatureLength + 1];
+      const size_t segment_size = marker->data_length - kICCPSkipLength;
+      ICCPSegment* segment;
+
+      if (segment_size == 0 || count == 0 || seq == 0) {
+        fprintf(stderr, "[ICCP] size (%d) / count (%d) / sequence number (%d)"
+                        " cannot be 0!\n",
+                (int)segment_size, seq, count);
+        return 0;
+      }
+
+      if (expected_count == 0) {
+        expected_count = count;
+      } else if (expected_count != count) {
+        fprintf(stderr, "[ICCP] Inconsistent segment count (%d / %d)!\n",
+                expected_count, count);
+        return 0;
+      }
+
+      segment = iccp_segments + seq - 1;
+      if (segment->data_length != 0) {
+        fprintf(stderr, "[ICCP] Duplicate segment number (%d)!\n" , seq);
+        return 0;
+      }
+
+      segment->data = marker->data + kICCPSkipLength;
+      segment->data_length = segment_size;
+      segment->seq = seq;
+      total_size += segment_size;
+      if (seq > seq_max) seq_max = seq;
+      ++actual_count;
+    }
+  }
+
+  if (actual_count == 0) return 1;
+  if (seq_max != actual_count) {
+    fprintf(stderr, "[ICCP] Discontinuous segments, expected: %d actual: %d!\n",
+            actual_count, seq_max);
+    return 0;
+  }
+  if (expected_count != actual_count) {
+    fprintf(stderr, "[ICCP] Segment count: %d does not match expected: %d!\n",
+            actual_count, expected_count);
+    return 0;
+  }
+
+  // The segments may appear out of order in the file, sort them based on
+  // sequence number before assembling the payload.
+  qsort(iccp_segments, actual_count, sizeof(*iccp_segments),
+        CompareICCPSegments);
+
+  iccp->bytes = (uint8_t*)malloc(total_size);
+  if (iccp->bytes == NULL) return 0;
+  iccp->size = total_size;
+
+  {
+    int i;
+    size_t offset = 0;
+    for (i = 0; i < seq_max; ++i) {
+      memcpy(iccp->bytes + offset,
+             iccp_segments[i].data, iccp_segments[i].data_length);
+      offset += iccp_segments[i].data_length;
+    }
+  }
+  return 1;
+}
+
+// Returns true on success and false for memory errors and corrupt profiles.
+// The caller must use MetadataFree() on 'metadata' in all cases.
+static int ExtractMetadataFromJPEG(j_decompress_ptr dinfo,
+                                   Metadata* const metadata) {
+  static const struct {
+    int marker;
+    const char* signature;
+    size_t signature_length;
+    size_t storage_offset;
+  } kJPEGMetadataMap[] = {
+    // Exif 2.2 Section 4.7.2 Interoperability Structure of APP1 ...
+    { JPEG_APP1, "Exif\0",                        6, METADATA_OFFSET(exif) },
+    // XMP Specification Part 3 Section 3 Embedding XMP Metadata ... #JPEG
+    // TODO(jzern) Add support for 'ExtendedXMP'
+    { JPEG_APP1, "http://ns.adobe.com/xap/1.0/", 29, METADATA_OFFSET(xmp) },
+    { 0, NULL, 0, 0 },
+  };
+  jpeg_saved_marker_ptr marker;
+  // Treat ICC profiles separately as they may be segmented and out of order.
+  if (!StoreICCP(dinfo, &metadata->iccp)) return 0;
+
+  for (marker = dinfo->marker_list; marker != NULL; marker = marker->next) {
+    int i;
+    for (i = 0; kJPEGMetadataMap[i].marker != 0; ++i) {
+      if (marker->marker == kJPEGMetadataMap[i].marker &&
+          marker->data_length > kJPEGMetadataMap[i].signature_length &&
+          !memcmp(marker->data, kJPEGMetadataMap[i].signature,
+                  kJPEGMetadataMap[i].signature_length)) {
+        MetadataPayload* const payload =
+            (MetadataPayload*)((uint8_t*)metadata +
+                               kJPEGMetadataMap[i].storage_offset);
+
+        if (payload->bytes == NULL) {
+          const char* marker_data = (const char*)marker->data +
+                                    kJPEGMetadataMap[i].signature_length;
+          const size_t marker_data_length =
+              marker->data_length - kJPEGMetadataMap[i].signature_length;
+          if (!MetadataCopy(marker_data, marker_data_length, payload)) return 0;
+        } else {
+          fprintf(stderr, "Ignoring additional '%s' marker\n",
+                  kJPEGMetadataMap[i].signature);
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+#undef JPEG_APP1
+#undef JPEG_APP2
+
+// -----------------------------------------------------------------------------
+// JPEG decoding
+
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;
+  jmp_buf setjmp_buffer;
+};
+
+static void my_error_exit(j_common_ptr dinfo) {
+  struct my_error_mgr* myerr = (struct my_error_mgr*)dinfo->err;
+  dinfo->err->output_message(dinfo);
+  longjmp(myerr->setjmp_buffer, 1);
+}
+
+int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
+  int ok = 0;
+  int stride, width, height;
+  volatile struct jpeg_decompress_struct dinfo;
+  struct my_error_mgr jerr;
+  uint8_t* volatile rgb = NULL;
+  JSAMPROW buffer[1];
+
+  memset((j_decompress_ptr)&dinfo, 0, sizeof(dinfo));   // for setjmp sanity
+  dinfo.err = jpeg_std_error(&jerr.pub);
+  jerr.pub.error_exit = my_error_exit;
+
+  if (setjmp(jerr.setjmp_buffer)) {
+ Error:
+    MetadataFree(metadata);
+    jpeg_destroy_decompress((j_decompress_ptr)&dinfo);
+    goto End;
+  }
+
+  jpeg_create_decompress((j_decompress_ptr)&dinfo);
+  jpeg_stdio_src((j_decompress_ptr)&dinfo, in_file);
+  if (metadata != NULL) SaveMetadataMarkers((j_decompress_ptr)&dinfo);
+  jpeg_read_header((j_decompress_ptr)&dinfo, TRUE);
+
+  dinfo.out_color_space = JCS_RGB;
+  dinfo.do_fancy_upsampling = TRUE;
+
+  jpeg_start_decompress((j_decompress_ptr)&dinfo);
+
+  if (dinfo.output_components != 3) {
+    goto Error;
+  }
+
+  width = dinfo.output_width;
+  height = dinfo.output_height;
+  stride = dinfo.output_width * dinfo.output_components * sizeof(*rgb);
+
+  rgb = (uint8_t*)malloc(stride * height);
+  if (rgb == NULL) {
+    goto End;
+  }
+  buffer[0] = (JSAMPLE*)rgb;
+
+  while (dinfo.output_scanline < dinfo.output_height) {
+    if (jpeg_read_scanlines((j_decompress_ptr)&dinfo, buffer, 1) != 1) {
+      goto End;
+    }
+    buffer[0] += stride;
+  }
+
+  if (metadata != NULL) {
+    ok = ExtractMetadataFromJPEG((j_decompress_ptr)&dinfo, metadata);
+    if (!ok) {
+      fprintf(stderr, "Error extracting JPEG metadata!\n");
+      goto Error;
+    }
+  }
+
+  jpeg_finish_decompress((j_decompress_ptr)&dinfo);
+  jpeg_destroy_decompress((j_decompress_ptr)&dinfo);
+
+  // WebP conversion.
+  pic->width = width;
+  pic->height = height;
+  pic->use_argb = 1;      // store raw RGB samples
+  ok = WebPPictureImportRGB(pic, rgb, stride);
+  if (!ok) goto Error;
+
+ End:
+  free(rgb);
+  return ok;
+}
+#else  // !WEBP_HAVE_JPEG
+int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
+             struct Metadata* const metadata) {
+  (void)in_file;
+  (void)pic;
+  (void)metadata;
+  fprintf(stderr, "JPEG support not compiled. Please install the libjpeg "
+          "development package before building.\n");
+  return 0;
+}
+#endif  // WEBP_HAVE_JPEG
+
+// -----------------------------------------------------------------------------
--- a/examples/jpegdec.h
+++ b/examples/jpegdec.h
@ -0,0 +1,35 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// JPEG decode.
+
+#ifndef WEBP_EXAMPLES_JPEGDEC_H_
+#define WEBP_EXAMPLES_JPEGDEC_H_
+
+#include <stdio.h>
+#include "webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads a JPEG from 'in_file', returning the decoded output in 'pic'.
+// The output is RGB.
+// Returns true on success.
+int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
+             struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_JPEGDEC_H_
--- a/examples/metadata.c
+++ b/examples/metadata.c
@ -0,0 +1,49 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Metadata types and functions.
+//
+
+#include "./metadata.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/types.h"
+
+void MetadataInit(Metadata* const metadata) {
+  if (metadata == NULL) return;
+  memset(metadata, 0, sizeof(*metadata));
+}
+
+void MetadataPayloadDelete(MetadataPayload* const payload) {
+  if (payload == NULL) return;
+  free(payload->bytes);
+  payload->bytes = NULL;
+  payload->size = 0;
+}
+
+void MetadataFree(Metadata* const metadata) {
+  if (metadata == NULL) return;
+  MetadataPayloadDelete(&metadata->exif);
+  MetadataPayloadDelete(&metadata->iccp);
+  MetadataPayloadDelete(&metadata->xmp);
+}
+
+int MetadataCopy(const char* metadata, size_t metadata_len,
+                 MetadataPayload* const payload) {
+  if (metadata == NULL || metadata_len == 0 || payload == NULL) return 0;
+  payload->bytes = (uint8_t*)malloc(metadata_len);
+  if (payload->bytes == NULL) return 0;
+  payload->size = metadata_len;
+  memcpy(payload->bytes, metadata, metadata_len);
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
--- a/examples/metadata.h
+++ b/examples/metadata.h
@ -0,0 +1,47 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Metadata types and functions.
+//
+
+#ifndef WEBP_EXAMPLES_METADATA_H_
+#define WEBP_EXAMPLES_METADATA_H_
+
+#include "webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct MetadataPayload {
+  uint8_t* bytes;
+  size_t size;
+} MetadataPayload;
+
+typedef struct Metadata {
+  MetadataPayload exif;
+  MetadataPayload iccp;
+  MetadataPayload xmp;
+} Metadata;
+
+#define METADATA_OFFSET(x) offsetof(Metadata, x)
+
+void MetadataInit(Metadata* const metadata);
+void MetadataPayloadDelete(MetadataPayload* const payload);
+void MetadataFree(Metadata* const metadata);
+
+// Stores 'metadata' to 'payload->bytes', returns false on allocation error.
+int MetadataCopy(const char* metadata, size_t metadata_len,
+                 MetadataPayload* const payload);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_METADATA_H_
--- a/examples/pngdec.c
+++ b/examples/pngdec.c
@ -0,0 +1,302 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// PNG decode.
+
+#include "./pngdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_PNG
+#include <png.h>
+#include <setjmp.h>   // note: this must be included *after* png.h
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+static void PNGAPI error_function(png_structp png, png_const_charp error) {
+  if (error != NULL) fprintf(stderr, "libpng error: %s\n", error);
+  longjmp(png_jmpbuf(png), 1);
+}
+
+// Converts the NULL terminated 'hexstring' which contains 2-byte character
+// representations of hex values to raw data.
+// 'hexstring' may contain values consisting of [A-F][a-f][0-9] in pairs,
+// e.g., 7af2..., separated by any number of newlines.
+// 'expected_length' is the anticipated processed size.
+// On success the raw buffer is returned with its length equivalent to
+// 'expected_length'. NULL is returned if the processed length is less than
+// 'expected_length' or any character aside from those above is encountered.
+// The returned buffer must be freed by the caller.
+static uint8_t* HexStringToBytes(const char* hexstring,
+                                 size_t expected_length) {
+  const char* src = hexstring;
+  size_t actual_length = 0;
+  uint8_t* const raw_data = (uint8_t*)malloc(expected_length);
+  uint8_t* dst;
+
+  if (raw_data == NULL) return NULL;
+
+  for (dst = raw_data; actual_length < expected_length && *src != '\0'; ++src) {
+    char* end;
+    char val[3];
+    if (*src == '\n') continue;
+    val[0] = *src++;
+    val[1] = *src;
+    val[2] = '\0';
+    *dst++ = (uint8_t)strtol(val, &end, 16);
+    if (end != val + 2) break;
+    ++actual_length;
+  }
+
+  if (actual_length != expected_length) {
+    free(raw_data);
+    return NULL;
+  }
+  return raw_data;
+}
+
+static int ProcessRawProfile(const char* profile, size_t profile_len,
+                             MetadataPayload* const payload) {
+  const char* src = profile;
+  char* end;
+  int expected_length;
+
+  if (profile == NULL || profile_len == 0) return 0;
+
+  // ImageMagick formats 'raw profiles' as
+  // '\n<name>\n<length>(%8lu)\n<hex payload>\n'.
+  if (*src != '\n') {
+    fprintf(stderr, "Malformed raw profile, expected '\\n' got '\\x%.2X'\n",
+            *src);
+    return 0;
+  }
+  ++src;
+  // skip the profile name and extract the length.
+  while (*src != '\0' && *src++ != '\n') {}
+  expected_length = (int)strtol(src, &end, 10);
+  if (*end != '\n') {
+    fprintf(stderr, "Malformed raw profile, expected '\\n' got '\\x%.2X'\n",
+            *end);
+    return 0;
+  }
+  ++end;
+
+  // 'end' now points to the profile payload.
+  payload->bytes = HexStringToBytes(end, expected_length);
+  if (payload->bytes == NULL) return 0;
+  payload->size = expected_length;
+  return 1;
+}
+
+static const struct {
+  const char* name;
+  int (*process)(const char* profile, size_t profile_len,
+                 MetadataPayload* const payload);
+  size_t storage_offset;
+} kPNGMetadataMap[] = {
+  // http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html#TextualData
+  // See also: ExifTool on CPAN.
+  { "Raw profile type exif", ProcessRawProfile, METADATA_OFFSET(exif) },
+  { "Raw profile type xmp",  ProcessRawProfile, METADATA_OFFSET(xmp) },
+  // Exiftool puts exif data in APP1 chunk, too.
+  { "Raw profile type APP1", ProcessRawProfile, METADATA_OFFSET(exif) },
+  // XMP Specification Part 3, Section 3 #PNG
+  { "XML:com.adobe.xmp",     MetadataCopy,      METADATA_OFFSET(xmp) },
+  { NULL, NULL, 0 },
+};
+
+// Looks for metadata at both the beginning and end of the PNG file, giving
+// preference to the head.
+// Returns true on success. The caller must use MetadataFree() on 'metadata' in
+// all cases.
+static int ExtractMetadataFromPNG(png_structp png,
+                                  png_infop const head_info,
+                                  png_infop const end_info,
+                                  Metadata* const metadata) {
+  int p;
+
+  for (p = 0; p < 2; ++p)  {
+    png_infop const info = (p == 0) ? head_info : end_info;
+    png_textp text = NULL;
+    const int num = png_get_text(png, info, &text, NULL);
+    int i;
+    // Look for EXIF / XMP metadata.
+    for (i = 0; i < num; ++i, ++text) {
+      int j;
+      for (j = 0; kPNGMetadataMap[j].name != NULL; ++j) {
+        if (!strcmp(text->key, kPNGMetadataMap[j].name)) {
+          MetadataPayload* const payload =
+              (MetadataPayload*)((uint8_t*)metadata +
+                                 kPNGMetadataMap[j].storage_offset);
+          png_size_t text_length;
+          switch (text->compression) {
+#ifdef PNG_iTXt_SUPPORTED
+            case PNG_ITXT_COMPRESSION_NONE:
+            case PNG_ITXT_COMPRESSION_zTXt:
+              text_length = text->itxt_length;
+              break;
+#endif
+            case PNG_TEXT_COMPRESSION_NONE:
+            case PNG_TEXT_COMPRESSION_zTXt:
+            default:
+              text_length = text->text_length;
+              break;
+          }
+          if (payload->bytes != NULL) {
+            fprintf(stderr, "Ignoring additional '%s'\n", text->key);
+          } else if (!kPNGMetadataMap[j].process(text->text, text_length,
+                                                 payload)) {
+            fprintf(stderr, "Failed to process: '%s'\n", text->key);
+            return 0;
+          }
+          break;
+        }
+      }
+    }
+    // Look for an ICC profile.
+    {
+      png_charp name;
+      int comp_type;
+#if ((PNG_LIBPNG_VER_MAJOR << 8) | PNG_LIBPNG_VER_MINOR << 0) < \
+    ((1 << 8) | (5 << 0))
+      png_charp profile;
+#else  // >= libpng 1.5.0
+      png_bytep profile;
+#endif
+      png_uint_32 len;
+
+      if (png_get_iCCP(png, info,
+                       &name, &comp_type, &profile, &len) == PNG_INFO_iCCP) {
+        if (!MetadataCopy((const char*)profile, len, &metadata->iccp)) return 0;
+      }
+    }
+  }
+
+  return 1;
+}
+
+int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
+            Metadata* const metadata) {
+  volatile png_structp png;
+  volatile png_infop info = NULL;
+  volatile png_infop end_info = NULL;
+  int color_type, bit_depth, interlaced;
+  int has_alpha;
+  int num_passes;
+  int p;
+  int ok = 0;
+  png_uint_32 width, height, y;
+  int stride;
+  uint8_t* volatile rgb = NULL;
+
+  png = png_create_read_struct(PNG_LIBPNG_VER_STRING, 0, 0, 0);
+  if (png == NULL) {
+    goto End;
+  }
+
+  png_set_error_fn(png, 0, error_function, NULL);
+  if (setjmp(png_jmpbuf(png))) {
+ Error:
+    MetadataFree(metadata);
+    goto End;
+  }
+
+  info = png_create_info_struct(png);
+  if (info == NULL) goto Error;
+  end_info = png_create_info_struct(png);
+  if (end_info == NULL) goto Error;
+
+  png_init_io(png, in_file);
+  png_read_info(png, info);
+  if (!png_get_IHDR(png, info,
+                    &width, &height, &bit_depth, &color_type, &interlaced,
+                    NULL, NULL)) goto Error;
+
+  png_set_strip_16(png);
+  png_set_packing(png);
+  if (color_type == PNG_COLOR_TYPE_PALETTE) {
+    png_set_palette_to_rgb(png);
+  }
+  if (color_type == PNG_COLOR_TYPE_GRAY ||
+      color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+    if (bit_depth < 8) {
+      png_set_expand_gray_1_2_4_to_8(png);
+    }
+    png_set_gray_to_rgb(png);
+  }
+  if (png_get_valid(png, info, PNG_INFO_tRNS)) {
+    png_set_tRNS_to_alpha(png);
+    has_alpha = 1;
+  } else {
+    has_alpha = !!(color_type & PNG_COLOR_MASK_ALPHA);
+  }
+
+  if (!keep_alpha) {
+    png_set_strip_alpha(png);
+    has_alpha = 0;
+  }
+
+  num_passes = png_set_interlace_handling(png);
+  png_read_update_info(png, info);
+  stride = (has_alpha ? 4 : 3) * width * sizeof(*rgb);
+  rgb = (uint8_t*)malloc(stride * height);
+  if (rgb == NULL) goto Error;
+  for (p = 0; p < num_passes; ++p) {
+    for (y = 0; y < height; ++y) {
+      png_bytep row = (png_bytep)(rgb + y * stride);
+      png_read_rows(png, &row, NULL, 1);
+    }
+  }
+  png_read_end(png, end_info);
+
+  if (metadata != NULL &&
+      !ExtractMetadataFromPNG(png, info, end_info, metadata)) {
+    fprintf(stderr, "Error extracting PNG metadata!\n");
+    goto Error;
+  }
+
+  pic->width = width;
+  pic->height = height;
+  pic->use_argb = 1;
+  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
+                 : WebPPictureImportRGB(pic, rgb, stride);
+
+  if (!ok) {
+    goto Error;
+  }
+
+ End:
+  if (png != NULL) {
+    png_destroy_read_struct((png_structpp)&png,
+                            (png_infopp)&info, (png_infopp)&end_info);
+  }
+  free(rgb);
+  return ok;
+}
+#else  // !WEBP_HAVE_PNG
+int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
+            struct Metadata* const metadata) {
+  (void)in_file;
+  (void)pic;
+  (void)keep_alpha;
+  (void)metadata;
+  fprintf(stderr, "PNG support not compiled. Please install the libpng "
+          "development package before building.\n");
+  return 0;
+}
+#endif  // WEBP_HAVE_PNG
+
+// -----------------------------------------------------------------------------
--- a/examples/pngdec.h
+++ b/examples/pngdec.h
@ -0,0 +1,35 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// PNG decode.
+
+#ifndef WEBP_EXAMPLES_PNGDEC_H_
+#define WEBP_EXAMPLES_PNGDEC_H_
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads a PNG from 'in_file', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the PNG has an alpha channel, the output is RGBA
+// otherwise it will be RGB.
+// Returns true on success.
+int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
+            struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_PNGDEC_H_
--- a/examples/stopwatch.h
+++ b/examples/stopwatch.h
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Helper functions to measure elapsed time.
@ -12,12 +14,16 @@
 #ifndef WEBP_EXAMPLES_STOPWATCH_H_
 #define WEBP_EXAMPLES_STOPWATCH_H_

-#ifdef _WIN32
+#if defined _WIN32 && !defined __GNUC__
 #include <windows.h>

 typedef LARGE_INTEGER Stopwatch;

-static inline double StopwatchReadAndReset(Stopwatch* watch) {
+static WEBP_INLINE void StopwatchReset(Stopwatch* watch) {
+  QueryPerformanceCounter(watch);
+}
+
+static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
  const LARGE_INTEGER old_value = *watch;
  LARGE_INTEGER freq;
  if (!QueryPerformanceCounter(watch))
@ -30,18 +36,25 @@ static inline double StopwatchReadAndReset(Stopwatch* watch) {
 }


-#else    // !_WIN32
+#else    /* !_WIN32 */
 #include <sys/time.h>

 typedef struct timeval Stopwatch;

-static inline double StopwatchReadAndReset(Stopwatch* watch) {
-  const struct timeval old_value = *watch;
+static WEBP_INLINE void StopwatchReset(Stopwatch* watch) {
  gettimeofday(watch, NULL);
-  return watch->tv_sec - old_value.tv_sec +
-      (watch->tv_usec - old_value.tv_usec) / 1000000.0;
 }

-#endif   // !_WIN32
+static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
+  struct timeval old_value;
+  double delta_sec, delta_usec;
+  memcpy(&old_value, watch, sizeof(old_value));
+  gettimeofday(watch, NULL);
+  delta_sec = (double)watch->tv_sec - old_value.tv_sec;
+  delta_usec = (double)watch->tv_usec - old_value.tv_usec;
+  return delta_sec + delta_usec / 1000000.0;
+}

-#endif  // WEBP_EXAMPLES_STOPWATCH_H_
+#endif   /* _WIN32 */
+
+#endif  /* WEBP_EXAMPLES_STOPWATCH_H_ */
--- a/examples/test.webp
+++ b/examples/test.webp
--- a/examples/tiffdec.c
+++ b/examples/tiffdec.c
@ -0,0 +1,141 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// TIFF decode.
+
+#include "./tiffdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_TIFF
+#include <tiffio.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+static const struct {
+  ttag_t tag;
+  size_t storage_offset;
+} kTIFFMetadataMap[] = {
+  { TIFFTAG_ICCPROFILE, METADATA_OFFSET(iccp) },
+  { TIFFTAG_XMLPACKET,  METADATA_OFFSET(xmp) },
+  { 0, 0 },
+};
+
+// Returns true on success. The caller must use MetadataFree() on 'metadata' in
+// all cases.
+static int ExtractMetadataFromTIFF(TIFF* const tif, Metadata* const metadata) {
+  int i;
+  toff_t exif_ifd_offset;
+
+  for (i = 0; kTIFFMetadataMap[i].tag != 0; ++i) {
+    MetadataPayload* const payload =
+        (MetadataPayload*)((uint8_t*)metadata +
+                           kTIFFMetadataMap[i].storage_offset);
+    void* tag_data;
+    uint32 tag_data_len;
+
+    if (TIFFGetField(tif, kTIFFMetadataMap[i].tag, &tag_data_len, &tag_data) &&
+        !MetadataCopy((const char*)tag_data, tag_data_len, payload)) {
+      return 0;
+    }
+  }
+
+  // TODO(jzern): To extract the raw EXIF directory some parsing of it would be
+  // necessary to determine the overall size. In addition, value offsets in
+  // individual directory entries may need to be updated as, depending on the
+  // type, they are file based.
+  // Exif 2.2 Section 4.6.2 Tag Structure
+  // TIFF Revision 6.0 Part 1 Section 2 TIFF Structure #Image File Directory
+  if (TIFFGetField(tif, TIFFTAG_EXIFIFD, &exif_ifd_offset)) {
+    fprintf(stderr, "Warning: EXIF extraction from TIFF is unsupported.\n");
+  }
+  return 1;
+}
+
+int ReadTIFF(const char* const filename,
+             WebPPicture* const pic, int keep_alpha,
+             Metadata* const metadata) {
+  TIFF* const tif = TIFFOpen(filename, "r");
+  uint32 width, height;
+  uint32* raster;
+  int ok = 0;
+  tdir_t dircount;
+
+  if (tif == NULL) {
+    fprintf(stderr, "Error! Cannot open TIFF file '%s'\n", filename);
+    return 0;
+  }
+
+  dircount = TIFFNumberOfDirectories(tif);
+  if (dircount > 1) {
+    fprintf(stderr, "Warning: multi-directory TIFF files are not supported.\n"
+                    "Only the first will be used, %d will be ignored.\n",
+                    dircount - 1);
+  }
+
+  if (!(TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width) &&
+        TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height))) {
+    fprintf(stderr, "Error! Cannot retrieve TIFF image dimensions.\n");
+    return 0;
+  }
+  raster = (uint32*)_TIFFmalloc(width * height * sizeof(*raster));
+  if (raster != NULL) {
+    if (TIFFReadRGBAImageOriented(tif, width, height, raster,
+                                  ORIENTATION_TOPLEFT, 1)) {
+      const int stride = width * sizeof(*raster);
+      pic->width = width;
+      pic->height = height;
+      // TIFF data is ABGR
+#ifdef WORDS_BIGENDIAN
+      TIFFSwabArrayOfLong(raster, width * height);
+#endif
+      pic->use_argb = 1;
+      ok = keep_alpha
+         ? WebPPictureImportRGBA(pic, (const uint8_t*)raster, stride)
+         : WebPPictureImportRGBX(pic, (const uint8_t*)raster, stride);
+    }
+    _TIFFfree(raster);
+  } else {
+    fprintf(stderr, "Error allocating TIFF RGBA memory!\n");
+  }
+
+  if (ok) {
+    if (metadata != NULL) {
+      ok = ExtractMetadataFromTIFF(tif, metadata);
+      if (!ok) {
+        fprintf(stderr, "Error extracting TIFF metadata!\n");
+        MetadataFree(metadata);
+        WebPPictureFree(pic);
+      }
+    }
+  }
+
+  TIFFClose(tif);
+  return ok;
+}
+#else  // !WEBP_HAVE_TIFF
+int ReadTIFF(const char* const filename,
+             struct WebPPicture* const pic, int keep_alpha,
+             struct Metadata* const metadata) {
+  (void)filename;
+  (void)pic;
+  (void)keep_alpha;
+  (void)metadata;
+  fprintf(stderr, "TIFF support not compiled. Please install the libtiff "
+          "development package before building.\n");
+  return 0;
+}
+#endif  // WEBP_HAVE_TIFF
+
+// -----------------------------------------------------------------------------
--- a/examples/tiffdec.h
+++ b/examples/tiffdec.h
@ -0,0 +1,34 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// TIFF decode.
+
+#ifndef WEBP_EXAMPLES_TIFFDEC_H_
+#define WEBP_EXAMPLES_TIFFDEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads a TIFF from 'filename', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the TIFF has an alpha channel, the output is RGBA
+// otherwise it will be RGB.
+// Returns true on success.
+int ReadTIFF(const char* const filename,
+             struct WebPPicture* const pic, int keep_alpha,
+             struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_TIFFDEC_H_
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -0,0 +1,553 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Simple OpenGL-based WebP file viewer.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(WEBP_HAVE_GL)
+
+#if defined(HAVE_GLUT_GLUT_H)
+#include <GLUT/glut.h>
+#else
+#include <GL/glut.h>
+#ifdef FREEGLUT
+#include <GL/freeglut.h>
+#endif
+#endif
+
+#ifdef WEBP_HAVE_QCMS
+#include <qcms.h>
+#endif
+
+#include "webp/decode.h"
+#include "webp/demux.h"
+
+#include "./example_util.h"
+
+#ifdef _MSC_VER
+#define snprintf _snprintf
+#endif
+
+static void Help(void);
+
+// Unfortunate global variables. Gathered into a struct for comfort.
+static struct {
+  int has_animation;
+  int has_color_profile;
+  int done;
+  int decoding_error;
+  int print_info;
+  int use_color_profile;
+
+  int canvas_width, canvas_height;
+  int loop_count;
+  uint32_t bg_color;
+
+  const char* file_name;
+  WebPData data;
+  WebPDecoderConfig config;
+  const WebPDecBuffer* pic;
+  WebPDemuxer* dmux;
+  WebPIterator curr_frame;
+  WebPIterator prev_frame;
+  WebPChunkIterator iccp;
+} kParams;
+
+static void ClearPreviousPic(void) {
+  WebPFreeDecBuffer((WebPDecBuffer*)kParams.pic);
+  kParams.pic = NULL;
+}
+
+static void ClearParams(void) {
+  ClearPreviousPic();
+  WebPDataClear(&kParams.data);
+  WebPDemuxReleaseIterator(&kParams.curr_frame);
+  WebPDemuxReleaseIterator(&kParams.prev_frame);
+  WebPDemuxReleaseChunkIterator(&kParams.iccp);
+  WebPDemuxDelete(kParams.dmux);
+  kParams.dmux = NULL;
+}
+
+// -----------------------------------------------------------------------------
+// Color profile handling
+static int ApplyColorProfile(const WebPData* const profile,
+                             WebPDecBuffer* const rgba) {
+#ifdef WEBP_HAVE_QCMS
+  int i, ok = 0;
+  uint8_t* line;
+  uint8_t major_revision;
+  qcms_profile* input_profile = NULL;
+  qcms_profile* output_profile = NULL;
+  qcms_transform* transform = NULL;
+  const qcms_data_type input_type = QCMS_DATA_RGBA_8;
+  const qcms_data_type output_type = QCMS_DATA_RGBA_8;
+  const qcms_intent intent = QCMS_INTENT_DEFAULT;
+
+  if (profile == NULL || rgba == NULL) return 0;
+  if (profile->bytes == NULL || profile->size < 10) return 1;
+  major_revision = profile->bytes[8];
+
+  qcms_enable_iccv4();
+  input_profile = qcms_profile_from_memory(profile->bytes, profile->size);
+  // qcms_profile_is_bogus() is broken with ICCv4.
+  if (input_profile == NULL ||
+      (major_revision < 4 && qcms_profile_is_bogus(input_profile))) {
+    fprintf(stderr, "Color profile is bogus!\n");
+    goto Error;
+  }
+
+  output_profile = qcms_profile_sRGB();
+  if (output_profile == NULL) {
+    fprintf(stderr, "Error creating output color profile!\n");
+    goto Error;
+  }
+
+  qcms_profile_precache_output_transform(output_profile);
+  transform = qcms_transform_create(input_profile, input_type,
+                                    output_profile, output_type,
+                                    intent);
+  if (transform == NULL) {
+    fprintf(stderr, "Error creating color transform!\n");
+    goto Error;
+  }
+
+  line = rgba->u.RGBA.rgba;
+  for (i = 0; i < rgba->height; ++i, line += rgba->u.RGBA.stride) {
+    qcms_transform_data(transform, line, line, rgba->width);
+  }
+  ok = 1;
+
+ Error:
+  if (input_profile != NULL) qcms_profile_release(input_profile);
+  if (output_profile != NULL) qcms_profile_release(output_profile);
+  if (transform != NULL) qcms_transform_release(transform);
+  return ok;
+#else
+  (void)profile;
+  (void)rgba;
+  return 1;
+#endif  // WEBP_HAVE_QCMS
+}
+
+//------------------------------------------------------------------------------
+// File decoding
+
+static int Decode(void) {   // Fills kParams.curr_frame
+  const WebPIterator* const curr = &kParams.curr_frame;
+  WebPDecoderConfig* const config = &kParams.config;
+  WebPDecBuffer* const output_buffer = &config->output;
+  int ok = 0;
+
+  ClearPreviousPic();
+  output_buffer->colorspace = MODE_RGBA;
+  ok = (WebPDecode(curr->fragment.bytes, curr->fragment.size,
+                   config) == VP8_STATUS_OK);
+  if (!ok) {
+    fprintf(stderr, "Decoding of frame #%d failed!\n", curr->frame_num);
+  } else {
+    kParams.pic = output_buffer;
+    if (kParams.use_color_profile) {
+      ok = ApplyColorProfile(&kParams.iccp.chunk, output_buffer);
+      if (!ok) {
+        fprintf(stderr, "Applying color profile to frame #%d failed!\n",
+                curr->frame_num);
+      }
+    }
+  }
+  return ok;
+}
+
+static void decode_callback(int what) {
+  if (what == 0 && !kParams.done) {
+    int duration = 0;
+    if (kParams.dmux != NULL) {
+      WebPIterator* const curr = &kParams.curr_frame;
+      if (!WebPDemuxNextFrame(curr)) {
+        WebPDemuxReleaseIterator(curr);
+        if (WebPDemuxGetFrame(kParams.dmux, 1, curr)) {
+          --kParams.loop_count;
+          kParams.done = (kParams.loop_count == 0);
+        } else {
+          kParams.decoding_error = 1;
+          kParams.done = 1;
+          return;
+        }
+      }
+      duration = curr->duration;
+    }
+    if (!Decode()) {
+      kParams.decoding_error = 1;
+      kParams.done = 1;
+    } else {
+      glutPostRedisplay();
+      glutTimerFunc(duration, decode_callback, what);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Callbacks
+
+static void HandleKey(unsigned char key, int pos_x, int pos_y) {
+  (void)pos_x;
+  (void)pos_y;
+  if (key == 'q' || key == 'Q' || key == 27 /* Esc */) {
+#ifdef FREEGLUT
+    glutLeaveMainLoop();
+#else
+    ClearParams();
+    exit(0);
+#endif
+  } else if (key == 'c') {
+    if (kParams.has_color_profile && !kParams.decoding_error) {
+      kParams.use_color_profile = 1 - kParams.use_color_profile;
+
+      if (kParams.has_animation) {
+        // Restart the completed animation to pickup the color profile change.
+        if (kParams.done && kParams.loop_count == 0) {
+          kParams.loop_count =
+              (int)WebPDemuxGetI(kParams.dmux, WEBP_FF_LOOP_COUNT) + 1;
+          kParams.done = 0;
+          // Start the decode loop immediately.
+          glutTimerFunc(0, decode_callback, 0);
+        }
+      } else {
+        Decode();
+        glutPostRedisplay();
+      }
+    }
+  } else if (key == 'i') {
+    kParams.print_info = 1 - kParams.print_info;
+    glutPostRedisplay();
+  }
+}
+
+static void HandleReshape(int width, int height) {
+  // TODO(skal): proper handling of resize, esp. for large pictures.
+  // + key control of the zoom.
+  glViewport(0, 0, width, height);
+  glMatrixMode(GL_PROJECTION);
+  glLoadIdentity();
+  glMatrixMode(GL_MODELVIEW);
+  glLoadIdentity();
+}
+
+static void PrintString(const char* const text) {
+  void* const font = GLUT_BITMAP_9_BY_15;
+  int i;
+  for (i = 0; text[i]; ++i) {
+    glutBitmapCharacter(font, text[i]);
+  }
+}
+
+static float GetColorf(uint32_t color, int shift) {
+  return (color >> shift) / 255.f;
+}
+
+static void DrawCheckerBoard(void) {
+  const int square_size = 8;  // must be a power of 2
+  int x, y;
+  GLint viewport[4];  // x, y, width, height
+
+  glPushMatrix();
+
+  glGetIntegerv(GL_VIEWPORT, viewport);
+  // shift to integer coordinates with (0,0) being top-left.
+  glOrtho(0, viewport[2], viewport[3], 0, -1, 1);
+  for (y = 0; y < viewport[3]; y += square_size) {
+    for (x = 0; x < viewport[2]; x += square_size) {
+      const GLubyte color = 128 + 64 * (!((x + y) & square_size));
+      glColor3ub(color, color, color);
+      glRecti(x, y, x + square_size, y + square_size);
+    }
+  }
+  glPopMatrix();
+}
+
+static void HandleDisplay(void) {
+  const WebPDecBuffer* const pic = kParams.pic;
+  const WebPIterator* const curr = &kParams.curr_frame;
+  WebPIterator* const prev = &kParams.prev_frame;
+  GLfloat xoff, yoff;
+  if (pic == NULL) return;
+  glPushMatrix();
+  glPixelZoom(1, -1);
+  xoff = (GLfloat)(2. * curr->x_offset / kParams.canvas_width);
+  yoff = (GLfloat)(2. * curr->y_offset / kParams.canvas_height);
+  glRasterPos2f(-1.f + xoff, 1.f - yoff);
+  glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+  glPixelStorei(GL_UNPACK_ROW_LENGTH, pic->u.RGBA.stride / 4);
+
+  if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ||
+      curr->blend_method == WEBP_MUX_NO_BLEND) {
+    // TODO(later): these offsets and those above should factor in window size.
+    //              they will be incorrect if the window is resized.
+    // glScissor() takes window coordinates (0,0 at bottom left).
+    int window_x, window_y;
+    if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+      // Clear the previous frame rectangle.
+      window_x = prev->x_offset;
+      window_y = kParams.canvas_height - prev->y_offset - prev->height;
+    } else {  // curr->blend_method == WEBP_MUX_NO_BLEND.
+      // We simulate no-blending behavior by first clearing the current frame
+      // rectangle (to a checker-board) and then alpha-blending against it.
+      window_x = curr->x_offset;
+      window_y = kParams.canvas_height - curr->y_offset - curr->height;
+    }
+    glEnable(GL_SCISSOR_TEST);
+    // Only update the requested area, not the whole canvas.
+    glScissor(window_x, window_y, prev->width, prev->height);
+
+    glClear(GL_COLOR_BUFFER_BIT);  // use clear color
+    DrawCheckerBoard();
+
+    glDisable(GL_SCISSOR_TEST);
+  }
+
+  *prev = *curr;
+
+  glDrawPixels(pic->width, pic->height,
+               GL_RGBA, GL_UNSIGNED_BYTE,
+               (GLvoid*)pic->u.RGBA.rgba);
+  if (kParams.print_info) {
+    char tmp[32];
+
+    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
+    glRasterPos2f(-0.95f, 0.90f);
+    PrintString(kParams.file_name);
+
+    snprintf(tmp, sizeof(tmp), "Dimension:%d x %d", pic->width, pic->height);
+    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
+    glRasterPos2f(-0.95f, 0.80f);
+    PrintString(tmp);
+    if (curr->x_offset != 0 || curr->y_offset != 0) {
+      snprintf(tmp, sizeof(tmp), " (offset:%d,%d)",
+               curr->x_offset, curr->y_offset);
+      glRasterPos2f(-0.95f, 0.70f);
+      PrintString(tmp);
+    }
+  }
+  glPopMatrix();
+  glFlush();
+}
+
+static void StartDisplay(void) {
+  const int width = kParams.canvas_width;
+  const int height = kParams.canvas_height;
+  glutInitDisplayMode(GLUT_RGBA);
+  glutInitWindowSize(width, height);
+  glutCreateWindow("WebP viewer");
+  glutDisplayFunc(HandleDisplay);
+  glutIdleFunc(NULL);
+  glutKeyboardFunc(HandleKey);
+  glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+  glEnable(GL_BLEND);
+  glClearColor(GetColorf(kParams.bg_color, 0),
+               GetColorf(kParams.bg_color, 8),
+               GetColorf(kParams.bg_color, 16),
+               GetColorf(kParams.bg_color, 24));
+  HandleReshape(width, height);
+  glClear(GL_COLOR_BUFFER_BIT);
+  DrawCheckerBoard();
+}
+
+//------------------------------------------------------------------------------
+// Main
+
+static void Help(void) {
+  printf("Usage: vwebp in_file [options]\n\n"
+         "Decodes the WebP image file and visualize it using OpenGL\n"
+         "Options are:\n"
+         "  -version  .... print version number and exit\n"
+         "  -noicc ....... don't use the icc profile if present\n"
+         "  -nofancy ..... don't use the fancy YUV420 upscaler\n"
+         "  -nofilter .... disable in-loop filtering\n"
+         "  -dither <int>  dithering strength (0..100), default=50\n"
+#if WEBP_DECODER_ABI_VERSION > 0x0204
+         "  -noalphadither disable alpha plane dithering\n"
+#endif
+         "  -mt .......... use multi-threading\n"
+         "  -info ........ print info\n"
+         "  -h     ....... this help message\n"
+         "\n"
+         "Keyboard shortcuts:\n"
+         "  'c' ................ toggle use of color profile\n"
+         "  'i' ................ overlay file information\n"
+         "  'q' / 'Q' / ESC .... quit\n"
+        );
+}
+
+int main(int argc, char *argv[]) {
+  int c;
+  WebPDecoderConfig* const config = &kParams.config;
+  WebPIterator* const curr = &kParams.curr_frame;
+  WebPIterator* const prev = &kParams.prev_frame;
+
+  if (!WebPInitDecoderConfig(config)) {
+    fprintf(stderr, "Library version mismatch!\n");
+    return -1;
+  }
+  config->options.dithering_strength = 50;
+#if WEBP_DECODER_ABI_VERSION > 0x0204
+  config->options.alpha_dithering_strength = 100;
+#endif
+  kParams.use_color_profile = 1;
+
+  for (c = 1; c < argc; ++c) {
+    int parse_error = 0;
+    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      Help();
+      return 0;
+    } else if (!strcmp(argv[c], "-noicc")) {
+      kParams.use_color_profile = 0;
+    } else if (!strcmp(argv[c], "-nofancy")) {
+      config->options.no_fancy_upsampling = 1;
+    } else if (!strcmp(argv[c], "-nofilter")) {
+      config->options.bypass_filtering = 1;
+#if WEBP_DECODER_ABI_VERSION > 0x0204
+    } else if (!strcmp(argv[c], "-noalphadither")) {
+      config->options.alpha_dithering_strength = 0;
+#endif
+    } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) {
+      config->options.dithering_strength =
+          ExUtilGetInt(argv[++c], 0, &parse_error);
+    } else if (!strcmp(argv[c], "-info")) {
+      kParams.print_info = 1;
+    } else if (!strcmp(argv[c], "-version")) {
+      const int dec_version = WebPGetDecoderVersion();
+      const int dmux_version = WebPGetDemuxVersion();
+      printf("WebP Decoder version: %d.%d.%d\nWebP Demux version: %d.%d.%d\n",
+             (dec_version >> 16) & 0xff, (dec_version >> 8) & 0xff,
+             dec_version & 0xff, (dmux_version >> 16) & 0xff,
+             (dmux_version >> 8) & 0xff, dmux_version & 0xff);
+      return 0;
+    } else if (!strcmp(argv[c], "-mt")) {
+      config->options.use_threads = 1;
+    } else if (!strcmp(argv[c], "--")) {
+      if (c < argc - 1) kParams.file_name = argv[++c];
+      break;
+    } else if (argv[c][0] == '-') {
+      printf("Unknown option '%s'\n", argv[c]);
+      Help();
+      return -1;
+    } else {
+      kParams.file_name = argv[c];
+    }
+
+    if (parse_error) {
+      Help();
+      return -1;
+    }
+  }
+
+  if (kParams.file_name == NULL) {
+    printf("missing input file!!\n");
+    Help();
+    return 0;
+  }
+
+  if (!ExUtilReadFile(kParams.file_name,
+                      &kParams.data.bytes, &kParams.data.size)) {
+    goto Error;
+  }
+
+  if (!WebPGetInfo(kParams.data.bytes, kParams.data.size, NULL, NULL)) {
+    fprintf(stderr, "Input file doesn't appear to be WebP format.\n");
+    goto Error;
+  }
+
+  kParams.dmux = WebPDemux(&kParams.data);
+  if (kParams.dmux == NULL) {
+    fprintf(stderr, "Could not create demuxing object!\n");
+    goto Error;
+  }
+
+  if (WebPDemuxGetI(kParams.dmux, WEBP_FF_FORMAT_FLAGS) & FRAGMENTS_FLAG) {
+    fprintf(stderr, "Image fragments are not supported for now!\n");
+    goto Error;
+  }
+  kParams.canvas_width = WebPDemuxGetI(kParams.dmux, WEBP_FF_CANVAS_WIDTH);
+  kParams.canvas_height = WebPDemuxGetI(kParams.dmux, WEBP_FF_CANVAS_HEIGHT);
+  if (kParams.print_info) {
+    printf("Canvas: %d x %d\n", kParams.canvas_width, kParams.canvas_height);
+  }
+
+  prev->width = kParams.canvas_width;
+  prev->height = kParams.canvas_height;
+  prev->x_offset = prev->y_offset = 0;
+  prev->dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+
+  memset(&kParams.iccp, 0, sizeof(kParams.iccp));
+  kParams.has_color_profile =
+      !!(WebPDemuxGetI(kParams.dmux, WEBP_FF_FORMAT_FLAGS) & ICCP_FLAG);
+  if (kParams.has_color_profile) {
+#ifdef WEBP_HAVE_QCMS
+    if (!WebPDemuxGetChunk(kParams.dmux, "ICCP", 1, &kParams.iccp)) goto Error;
+    printf("VP8X: Found color profile\n");
+#else
+    fprintf(stderr, "Warning: color profile present, but qcms is unavailable!\n"
+            "Build libqcms from Mozilla or Chromium and define WEBP_HAVE_QCMS "
+            "before building.\n");
+#endif
+  }
+
+  if (!WebPDemuxGetFrame(kParams.dmux, 1, curr)) goto Error;
+
+  kParams.has_animation = (curr->num_frames > 1);
+  kParams.loop_count = (int)WebPDemuxGetI(kParams.dmux, WEBP_FF_LOOP_COUNT);
+  kParams.bg_color = WebPDemuxGetI(kParams.dmux, WEBP_FF_BACKGROUND_COLOR);
+  printf("VP8X: Found %d images in file (loop count = %d)\n",
+         curr->num_frames, kParams.loop_count);
+
+  // Decode first frame
+  if (!Decode()) goto Error;
+
+  // Position iterator to last frame. Next call to HandleDisplay will wrap over.
+  // We take this into account by bumping up loop_count.
+  WebPDemuxGetFrame(kParams.dmux, 0, curr);
+  if (kParams.loop_count) ++kParams.loop_count;
+
+  // Start display (and timer)
+  glutInit(&argc, argv);
+#ifdef FREEGLUT
+  glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_CONTINUE_EXECUTION);
+#endif
+  StartDisplay();
+
+  if (kParams.has_animation) glutTimerFunc(0, decode_callback, 0);
+  glutMainLoop();
+
+  // Should only be reached when using FREEGLUT:
+  ClearParams();
+  return 0;
+
+ Error:
+  ClearParams();
+  return -1;
+}
+
+#else   // !WEBP_HAVE_GL
+
+int main(int argc, const char *argv[]) {
+  fprintf(stderr, "OpenGL support not enabled in %s.\n", argv[0]);
+  (void)argc;
+  return 0;
+}
+
+#endif
+
+//------------------------------------------------------------------------------
--- a/examples/webpdec.c
+++ b/examples/webpdec.c
@ -0,0 +1,67 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebP decode.
+
+#include "./webpdec.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "webp/decode.h"
+#include "webp/encode.h"
+#include "./example_util.h"
+#include "./metadata.h"
+
+int ReadWebP(const char* const in_file, WebPPicture* const pic,
+             int keep_alpha, Metadata* const metadata) {
+  int ok = 0;
+  size_t data_size = 0;
+  const uint8_t* data = NULL;
+  VP8StatusCode status = VP8_STATUS_OK;
+  WebPDecoderConfig config;
+  WebPDecBuffer* const output_buffer = &config.output;
+  WebPBitstreamFeatures* const bitstream = &config.input;
+
+  // TODO(jzern): add Exif/XMP/ICC extraction.
+  if (metadata != NULL) {
+    fprintf(stderr, "Warning: metadata extraction from WebP is unsupported.\n");
+  }
+
+  if (!WebPInitDecoderConfig(&config)) {
+    fprintf(stderr, "Library version mismatch!\n");
+    return 0;
+  }
+
+  if (ExUtilLoadWebP(in_file, &data, &data_size, bitstream)) {
+    const int has_alpha = keep_alpha && bitstream->has_alpha;
+    output_buffer->colorspace = has_alpha ? MODE_RGBA : MODE_RGB;
+
+    status = ExUtilDecodeWebP(data, data_size, 0, &config);
+    if (status == VP8_STATUS_OK) {
+      const uint8_t* const rgba = output_buffer->u.RGBA.rgba;
+      const int stride = output_buffer->u.RGBA.stride;
+      pic->width = output_buffer->width;
+      pic->height = output_buffer->height;
+      pic->use_argb = 1;
+      ok = has_alpha ? WebPPictureImportRGBA(pic, rgba, stride)
+                     : WebPPictureImportRGB(pic, rgba, stride);
+    }
+  }
+
+  if (status != VP8_STATUS_OK) {
+    ExUtilPrintWebPError(in_file, status);
+  }
+
+  free((void*)data);
+  WebPFreeDecBuffer(output_buffer);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
--- a/examples/webpdec.h
+++ b/examples/webpdec.h
@ -0,0 +1,33 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebP decode.
+
+#ifndef WEBP_EXAMPLES_WEBPDEC_H_
+#define WEBP_EXAMPLES_WEBPDEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads a WebP from 'in_file', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the WebP has an alpha channel, the output is
+// RGBA otherwise it will be RGB.
+// Returns true on success.
+int ReadWebP(const char* const in_file, struct WebPPicture* const pic,
+             int keep_alpha, struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_WEBPDEC_H_
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
--- a/examples/wicdec.c
+++ b/examples/wicdec.c
@ -0,0 +1,349 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Windows Imaging Component (WIC) decode.
+
+#include "./wicdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef HAVE_WINCODEC_H
+#ifdef __MINGW32__
+#define INITGUID  // Without this GUIDs are declared extern and fail to link
+#endif
+#define CINTERFACE
+#define COBJMACROS
+#define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
+                         // code with COBJMACROS.
+#include <shlwapi.h>
+#include <windows.h>
+#include <wincodec.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+#define IFS(fn)                                                     \
+  do {                                                              \
+    if (SUCCEEDED(hr)) {                                            \
+      hr = (fn);                                                    \
+      if (FAILED(hr)) fprintf(stderr, #fn " failed %08lx\n", hr);   \
+    }                                                               \
+  } while (0)
+
+// modified version of DEFINE_GUID from guiddef.h.
+#define WEBP_DEFINE_GUID(name, l, w1, w2, b1, b2, b3, b4, b5, b6, b7, b8) \
+  static const GUID name = \
+      { l, w1, w2, { b1, b2,  b3,  b4,  b5,  b6,  b7,  b8 } }
+
+#ifdef __cplusplus
+#define MAKE_REFGUID(x) (x)
+#else
+#define MAKE_REFGUID(x) &(x)
+#endif
+
+typedef struct WICFormatImporter {
+  const GUID* pixel_format;
+  int bytes_per_pixel;
+  int (*import)(WebPPicture* const, const uint8_t* const, int);
+} WICFormatImporter;
+
+// From Microsoft SDK 7.0a -- wincodec.h
+// Create local copies for compatibility when building against earlier
+// versions of the SDK.
+WEBP_DEFINE_GUID(GUID_WICPixelFormat24bppBGR_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0c);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat24bppRGB_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0d);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppBGRA_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0f);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppRGBA_,
+                 0xf5c7ad2d, 0x6a8d, 0x43dd,
+                 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
+
+static HRESULT OpenInputStream(const char* filename, IStream** stream) {
+  HRESULT hr = S_OK;
+  IFS(SHCreateStreamOnFileA(filename, STGM_READ, stream));
+  if (FAILED(hr)) {
+    fprintf(stderr, "Error opening input file %s (%08lx)\n", filename, hr);
+  }
+  return hr;
+}
+
+// -----------------------------------------------------------------------------
+// Metadata processing
+
+// Stores the first non-zero sized color profile from 'frame' to 'iccp'.
+// Returns an HRESULT to indicate success or failure. The caller is responsible
+// for freeing 'iccp->bytes' in either case.
+static HRESULT ExtractICCP(IWICImagingFactory* const factory,
+                           IWICBitmapFrameDecode* const frame,
+                           MetadataPayload* const iccp) {
+  HRESULT hr = S_OK;
+  UINT i, count;
+  IWICColorContext** color_contexts;
+
+  IFS(IWICBitmapFrameDecode_GetColorContexts(frame, 0, NULL, &count));
+  if (FAILED(hr) || count == 0) return hr;
+
+  color_contexts = (IWICColorContext**)calloc(count, sizeof(*color_contexts));
+  if (color_contexts == NULL) return E_OUTOFMEMORY;
+  for (i = 0; SUCCEEDED(hr) && i < count; ++i) {
+    IFS(IWICImagingFactory_CreateColorContext(factory, &color_contexts[i]));
+  }
+
+  if (SUCCEEDED(hr)) {
+    UINT num_color_contexts;
+    IFS(IWICBitmapFrameDecode_GetColorContexts(frame,
+                                               count, color_contexts,
+                                               &num_color_contexts));
+    for (i = 0; SUCCEEDED(hr) && i < num_color_contexts; ++i) {
+      WICColorContextType type;
+      IFS(IWICColorContext_GetType(color_contexts[i], &type));
+      if (SUCCEEDED(hr) && type == WICColorContextProfile) {
+        UINT size;
+        IFS(IWICColorContext_GetProfileBytes(color_contexts[i],
+                                             0, NULL, &size));
+        if (size > 0) {
+          iccp->bytes = (uint8_t*)malloc(size);
+          if (iccp->bytes == NULL) {
+            hr = E_OUTOFMEMORY;
+            break;
+          }
+          iccp->size = size;
+          IFS(IWICColorContext_GetProfileBytes(color_contexts[i],
+                                               (UINT)iccp->size, iccp->bytes,
+                                               &size));
+          if (SUCCEEDED(hr) && size != iccp->size) {
+            fprintf(stderr, "Warning! ICC profile size (%u) != expected (%u)\n",
+                    size, (uint32_t)iccp->size);
+            iccp->size = size;
+          }
+          break;
+        }
+      }
+    }
+  }
+  for (i = 0; i < count; ++i) {
+    if (color_contexts[i] != NULL) IUnknown_Release(color_contexts[i]);
+  }
+  free(color_contexts);
+  return hr;
+}
+
+static HRESULT ExtractMetadata(IWICImagingFactory* const factory,
+                               IWICBitmapFrameDecode* const frame,
+                               Metadata* const metadata) {
+  // TODO(jzern): add XMP/EXIF extraction.
+  const HRESULT hr = ExtractICCP(factory, frame, &metadata->iccp);
+  if (FAILED(hr)) MetadataFree(metadata);
+  return hr;
+}
+
+// -----------------------------------------------------------------------------
+
+static int HasPalette(GUID pixel_format) {
+  return (IsEqualGUID(MAKE_REFGUID(pixel_format),
+                      MAKE_REFGUID(GUID_WICPixelFormat1bppIndexed)) ||
+          IsEqualGUID(MAKE_REFGUID(pixel_format),
+                      MAKE_REFGUID(GUID_WICPixelFormat2bppIndexed)) ||
+          IsEqualGUID(MAKE_REFGUID(pixel_format),
+                      MAKE_REFGUID(GUID_WICPixelFormat4bppIndexed)) ||
+          IsEqualGUID(MAKE_REFGUID(pixel_format),
+                      MAKE_REFGUID(GUID_WICPixelFormat8bppIndexed)));
+}
+
+static int HasAlpha(IWICImagingFactory* const factory,
+                    IWICBitmapDecoder* const decoder,
+                    IWICBitmapFrameDecode* const frame,
+                    GUID pixel_format) {
+  int has_alpha;
+  if (HasPalette(pixel_format)) {
+    IWICPalette* frame_palette = NULL;
+    IWICPalette* global_palette = NULL;
+    BOOL frame_palette_has_alpha = FALSE;
+    BOOL global_palette_has_alpha = FALSE;
+
+    // A palette may exist at the frame or container level,
+    // check IWICPalette::HasAlpha() for both if present.
+    if (SUCCEEDED(IWICImagingFactory_CreatePalette(factory, &frame_palette)) &&
+        SUCCEEDED(IWICBitmapFrameDecode_CopyPalette(frame, frame_palette))) {
+      IWICPalette_HasAlpha(frame_palette, &frame_palette_has_alpha);
+    }
+    if (SUCCEEDED(IWICImagingFactory_CreatePalette(factory, &global_palette)) &&
+        SUCCEEDED(IWICBitmapDecoder_CopyPalette(decoder, global_palette))) {
+      IWICPalette_HasAlpha(global_palette, &global_palette_has_alpha);
+    }
+    has_alpha = frame_palette_has_alpha || global_palette_has_alpha;
+
+    if (frame_palette != NULL) IUnknown_Release(frame_palette);
+    if (global_palette != NULL) IUnknown_Release(global_palette);
+  } else {
+    has_alpha = IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat32bppRGBA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_));
+  }
+  return has_alpha;
+}
+
+int ReadPictureWithWIC(const char* const filename,
+                       WebPPicture* const pic, int keep_alpha,
+                       Metadata* const metadata) {
+  // From Microsoft SDK 6.0a -- ks.h
+  // Define a local copy to avoid link errors under mingw.
+  WEBP_DEFINE_GUID(GUID_NULL_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  static const WICFormatImporter kAlphaFormatImporters[] = {
+    { &GUID_WICPixelFormat32bppBGRA_, 4, WebPPictureImportBGRA },
+    { &GUID_WICPixelFormat32bppRGBA_, 4, WebPPictureImportRGBA },
+    { NULL, 0, NULL },
+  };
+  static const WICFormatImporter kNonAlphaFormatImporters[] = {
+    { &GUID_WICPixelFormat24bppBGR_, 3, WebPPictureImportBGR },
+    { &GUID_WICPixelFormat24bppRGB_, 3, WebPPictureImportRGB },
+    { NULL, 0, NULL },
+  };
+  HRESULT hr = S_OK;
+  IWICBitmapFrameDecode* frame = NULL;
+  IWICFormatConverter* converter = NULL;
+  IWICImagingFactory* factory = NULL;
+  IWICBitmapDecoder* decoder = NULL;
+  IStream* stream = NULL;
+  UINT frame_count = 0;
+  UINT width = 0, height = 0;
+  BYTE* rgb = NULL;
+  WICPixelFormatGUID src_pixel_format = GUID_WICPixelFormatUndefined;
+  const WICFormatImporter* importer = NULL;
+  GUID src_container_format = GUID_NULL_;
+  static const GUID* kAlphaContainers[] = {
+    &GUID_ContainerFormatBmp,
+    &GUID_ContainerFormatPng,
+    &GUID_ContainerFormatTiff,
+    NULL
+  };
+  int has_alpha = 0;
+  int stride;
+
+  IFS(CoInitialize(NULL));
+  IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
+                       CLSCTX_INPROC_SERVER,
+                       MAKE_REFGUID(IID_IWICImagingFactory),
+                       (LPVOID*)&factory));
+  if (hr == REGDB_E_CLASSNOTREG) {
+    fprintf(stderr,
+            "Couldn't access Windows Imaging Component (are you running "
+            "Windows XP SP3 or newer?). Most formats not available. "
+            "Use -s for the available YUV input.\n");
+  }
+  // Prepare for image decoding.
+  IFS(OpenInputStream(filename, &stream));
+  IFS(IWICImagingFactory_CreateDecoderFromStream(
+          factory, stream, NULL,
+          WICDecodeMetadataCacheOnDemand, &decoder));
+  IFS(IWICBitmapDecoder_GetFrameCount(decoder, &frame_count));
+  if (SUCCEEDED(hr) && frame_count == 0) {
+    fprintf(stderr, "No frame found in input file.\n");
+    hr = E_FAIL;
+  }
+  IFS(IWICBitmapDecoder_GetFrame(decoder, 0, &frame));
+  IFS(IWICBitmapFrameDecode_GetPixelFormat(frame, &src_pixel_format));
+  IFS(IWICBitmapDecoder_GetContainerFormat(decoder, &src_container_format));
+
+  if (keep_alpha) {
+    const GUID** guid;
+    for (guid = kAlphaContainers; *guid != NULL; ++guid) {
+      if (IsEqualGUID(MAKE_REFGUID(src_container_format),
+                      MAKE_REFGUID(**guid))) {
+        has_alpha = HasAlpha(factory, decoder, frame, src_pixel_format);
+        break;
+      }
+    }
+  }
+
+  // Prepare for pixel format conversion (if necessary).
+  IFS(IWICImagingFactory_CreateFormatConverter(factory, &converter));
+
+  for (importer = has_alpha ? kAlphaFormatImporters : kNonAlphaFormatImporters;
+       hr == S_OK && importer->import != NULL; ++importer) {
+    BOOL can_convert;
+    const HRESULT cchr = IWICFormatConverter_CanConvert(
+        converter,
+        MAKE_REFGUID(src_pixel_format),
+        MAKE_REFGUID(*importer->pixel_format),
+        &can_convert);
+    if (SUCCEEDED(cchr) && can_convert) break;
+  }
+  if (importer->import == NULL) hr = E_FAIL;
+
+  IFS(IWICFormatConverter_Initialize(converter, (IWICBitmapSource*)frame,
+                                     importer->pixel_format,
+                                     WICBitmapDitherTypeNone,
+                                     NULL, 0.0, WICBitmapPaletteTypeCustom));
+
+  // Decode.
+  IFS(IWICFormatConverter_GetSize(converter, &width, &height));
+  stride = importer->bytes_per_pixel * width * sizeof(*rgb);
+  if (SUCCEEDED(hr)) {
+    rgb = (BYTE*)malloc(stride * height);
+    if (rgb == NULL)
+      hr = E_OUTOFMEMORY;
+  }
+  IFS(IWICFormatConverter_CopyPixels(converter, NULL,
+                                     stride, stride * height, rgb));
+
+  // WebP conversion.
+  if (SUCCEEDED(hr)) {
+    int ok;
+    pic->width = width;
+    pic->height = height;
+    pic->use_argb = 1;
+    ok = importer->import(pic, rgb, stride);
+    if (!ok) hr = E_FAIL;
+  }
+  if (SUCCEEDED(hr)) {
+    if (metadata != NULL) {
+      hr = ExtractMetadata(factory, frame, metadata);
+      if (FAILED(hr)) {
+        fprintf(stderr, "Error extracting image metadata using WIC!\n");
+      }
+    }
+  }
+
+  // Cleanup.
+  if (converter != NULL) IUnknown_Release(converter);
+  if (frame != NULL) IUnknown_Release(frame);
+  if (decoder != NULL) IUnknown_Release(decoder);
+  if (factory != NULL) IUnknown_Release(factory);
+  if (stream != NULL) IUnknown_Release(stream);
+  free(rgb);
+  return SUCCEEDED(hr);
+}
+#else  // !HAVE_WINCODEC_H
+int ReadPictureWithWIC(const char* const filename,
+                       struct WebPPicture* const pic, int keep_alpha,
+                       struct Metadata* const metadata) {
+  (void)filename;
+  (void)pic;
+  (void)keep_alpha;
+  (void)metadata;
+  fprintf(stderr, "Windows Imaging Component (WIC) support not compiled. "
+                  "Visual Studio and mingw-w64 builds support WIC. Make sure "
+                  "wincodec.h detection is working correctly if using autoconf "
+                  "and HAVE_WINCODEC_H is defined before building.\n");
+  return 0;
+}
+#endif  // HAVE_WINCODEC_H
+
+// -----------------------------------------------------------------------------
--- a/examples/wicdec.h
+++ b/examples/wicdec.h
@ -0,0 +1,34 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Windows Imaging Component (WIC) decode.
+
+#ifndef WEBP_EXAMPLES_WICDEC_H_
+#define WEBP_EXAMPLES_WICDEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads an image from 'filename', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the image has an alpha channel, the output is
+// RGBA otherwise it will be RGB.
+// Returns true on success.
+int ReadPictureWithWIC(const char* const filename,
+                       struct WebPPicture* const pic, int keep_alpha,
+                       struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_WICDEC_H_
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -0,0 +1,117 @@
+#!/bin/bash
+#
+# This script generates 'WebP.framework'. An iOS app can decode WebP images
+# by including 'WebP.framework'.
+#
+# Run ./iosbuild.sh to generate 'WebP.framework' under the current directory
+# (previous build will be erased if it exists).
+#
+# This script is inspired by the build script written by Carson McDonald.
+# (http://www.ioncannon.net/programming/1483/using-webp-to-reduce-native-ios-app-size/).
+
+set -e
+
+# Extract the latest SDK version from the final field of the form: iphoneosX.Y
+readonly SDK=$(xcodebuild -showsdks \
+  | grep iphoneos | sort | tail -n 1 | awk '{print substr($NF, 9)}'
+)
+# Extract Xcode version.
+readonly XCODE=$(xcodebuild -version | grep Xcode | cut -d " " -f2)
+if [[ -z "${XCODE}" ]]; then
+  echo "Xcode not available"
+  exit 1
+fi
+
+readonly OLDPATH=${PATH}
+
+# Add iPhoneOS-V6 to the list of platforms below if you need armv6 support.
+# Note that iPhoneOS-V6 support is not available with the iOS6 SDK.
+readonly PLATFORMS="iPhoneSimulator iPhoneOS-V7 iPhoneOS-V7s iPhoneOS-V7-arm64"
+readonly SRCDIR=$(dirname $0)
+readonly TOPDIR=$(pwd)
+readonly BUILDDIR="${TOPDIR}/iosbuild"
+readonly TARGETDIR="${TOPDIR}/WebP.framework"
+readonly DEVELOPER=$(xcode-select --print-path)
+readonly PLATFORMSROOT="${DEVELOPER}/Platforms"
+readonly LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
+LIBLIST=''
+
+if [[ -z "${SDK}" ]]; then
+  echo "iOS SDK not available"
+  exit 1
+elif [[ ${SDK} < 6.0 ]]; then
+  echo "You need iOS SDK version 6.0 or above"
+  exit 1
+else
+  echo "iOS SDK Version ${SDK}"
+fi
+
+rm -rf ${BUILDDIR}
+rm -rf ${TARGETDIR}
+mkdir -p ${BUILDDIR}
+mkdir -p ${TARGETDIR}/Headers/
+
+if [[ ! -e ${SRCDIR}/configure ]]; then
+  if ! (cd ${SRCDIR} && sh autogen.sh); then
+    cat <<EOT
+Error creating configure script!
+This script requires the autoconf/automake and libtool to build. MacPorts can
+be used to obtain these:
+http://www.macports.org/install.php
+EOT
+    exit 1
+  fi
+fi
+
+for PLATFORM in ${PLATFORMS}; do
+  ARCH2=""
+  if [[ "${PLATFORM}" == "iPhoneOS-V7-arm64" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="aarch64"
+    ARCH2="arm64"
+  elif [[ "${PLATFORM}" == "iPhoneOS-V7s" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="armv7s"
+  elif [[ "${PLATFORM}" == "iPhoneOS-V7" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="armv7"
+  elif [[ "${PLATFORM}" == "iPhoneOS-V6" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="armv6"
+  else
+    ARCH="i386"
+  fi
+
+  ROOTDIR="${BUILDDIR}/${PLATFORM}-${SDK}-${ARCH}"
+  mkdir -p "${ROOTDIR}"
+
+  DEVROOT="${DEVELOPER}/Toolchains/XcodeDefault.xctoolchain"
+  SDKROOT="${PLATFORMSROOT}/"
+  SDKROOT+="${PLATFORM}.platform/Developer/SDKs/${PLATFORM}${SDK}.sdk/"
+  CFLAGS="-arch ${ARCH2:-${ARCH}} -pipe -isysroot ${SDKROOT} -O3 -DNDEBUG"
+  CFLAGS+=" -miphoneos-version-min=6.0"
+
+  set -x
+  export PATH="${DEVROOT}/usr/bin:${OLDPATH}"
+  ${SRCDIR}/configure --host=${ARCH}-apple-darwin --prefix=${ROOTDIR} \
+    --build=$(${SRCDIR}/config.guess) \
+    --disable-shared --enable-static \
+    --enable-libwebpdecoder --enable-swap-16bit-csp \
+    CFLAGS="${CFLAGS}"
+  set +x
+
+  # run make only in the src/ directory to create libwebpdecoder.a
+  cd src/
+  make V=0
+  make install
+
+  LIBLIST+=" ${ROOTDIR}/lib/libwebpdecoder.a"
+
+  make clean
+  cd ..
+
+  export PATH=${OLDPATH}
+done
+
+cp -a ${SRCDIR}/src/webp/*.h ${TARGETDIR}/Headers/
+${LIPO} -create ${LIBLIST} -output ${TARGETDIR}/WebP
--- a/m4/ax_pthread.m4
+++ b/m4/ax_pthread.m4
@ -0,0 +1,332 @@
+# ===========================================================================
+#        http://www.gnu.org/software/autoconf-archive/ax_pthread.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+#
+# DESCRIPTION
+#
+#   This macro figures out how to build C programs using POSIX threads. It
+#   sets the PTHREAD_LIBS output variable to the threads library and linker
+#   flags, and the PTHREAD_CFLAGS output variable to any special C compiler
+#   flags that are needed. (The user can also force certain compiler
+#   flags/libs to be tested by setting these environment variables.)
+#
+#   Also sets PTHREAD_CC to any special C compiler that is needed for
+#   multi-threaded programs (defaults to the value of CC otherwise). (This
+#   is necessary on AIX to use the special cc_r compiler alias.)
+#
+#   NOTE: You are assumed to not only compile your program with these flags,
+#   but also link it with them as well. e.g. you should link with
+#   $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
+#
+#   If you are only building threads programs, you may wish to use these
+#   variables in your default LIBS, CFLAGS, and CC:
+#
+#     LIBS="$PTHREAD_LIBS $LIBS"
+#     CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+#     CC="$PTHREAD_CC"
+#
+#   In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant
+#   has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to that name
+#   (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+#
+#   Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the
+#   PTHREAD_PRIO_INHERIT symbol is defined when compiling with
+#   PTHREAD_CFLAGS.
+#
+#   ACTION-IF-FOUND is a list of shell commands to run if a threads library
+#   is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it
+#   is not found. If ACTION-IF-FOUND is not specified, the default action
+#   will define HAVE_PTHREAD.
+#
+#   Please let the authors know if this macro fails on any platform, or if
+#   you have any other suggestions or comments. This macro was based on work
+#   by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help
+#   from M. Frigo), as well as ac_pthread and hb_pthread macros posted by
+#   Alejandro Forero Cuervo to the autoconf macro repository. We are also
+#   grateful for the helpful feedback of numerous users.
+#
+#   Updated for Autoconf 2.68 by Daniel Richard G.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
+#   Copyright (c) 2011 Daniel Richard G. <skunk@iSKUNK.ORG>
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 21
+
+AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
+AC_DEFUN([AX_PTHREAD], [
+AC_REQUIRE([AC_CANONICAL_HOST])
+AC_LANG_PUSH([C])
+ax_pthread_ok=no
+
+# We used to check for pthread.h first, but this fails if pthread.h
+# requires special compiler flags (e.g. on True64 or Sequent).
+# It gets checked for in the link test anyway.
+
+# First of all, check if the user has set any of the PTHREAD_LIBS,
+# etcetera environment variables, and if threads linking works using
+# them:
+if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
+        AC_TRY_LINK_FUNC([pthread_join], [ax_pthread_ok=yes])
+        AC_MSG_RESULT([$ax_pthread_ok])
+        if test x"$ax_pthread_ok" = xno; then
+                PTHREAD_LIBS=""
+                PTHREAD_CFLAGS=""
+        fi
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+fi
+
+# We must check for the threads library under a number of different
+# names; the ordering is very important because some systems
+# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
+# libraries is broken (non-POSIX).
+
+# Create a list of thread flags to try.  Items starting with a "-" are
+# C compiler flags, and other items are library names, except for "none"
+# which indicates that we try without any flags at all, and "pthread-config"
+# which is a program returning the flags for the Pth emulation library.
+
+ax_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
+
+# The ordering *is* (sometimes) important.  Some notes on the
+# individual items follow:
+
+# pthreads: AIX (must check this before -lpthread)
+# none: in case threads are in libc; should be tried before -Kthread and
+#       other compiler flags to prevent continual compiler warnings
+# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
+# -pthreads: Solaris/gcc
+# -mthreads: Mingw32/gcc, Lynx/gcc
+# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+#      doesn't hurt to check since this sometimes defines pthreads too;
+#      also defines -D_REENTRANT)
+#      ... -mt is also the pthreads flag for HP/aCC
+# pthread: Linux, etcetera
+# --thread-safe: KAI C++
+# pthread-config: use pthread-config program (for GNU Pth library)
+
+case ${host_os} in
+        solaris*)
+
+        # On Solaris (at least, for some versions), libc contains stubbed
+        # (non-functional) versions of the pthreads routines, so link-based
+        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
+        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+        # a function called by this macro, so we could check for that, but
+        # who knows whether they'll stub that too in a future libc.)  So,
+        # we'll just look for -pthreads and -lpthread first:
+
+        ax_pthread_flags="-pthreads pthread -mt -pthread $ax_pthread_flags"
+        ;;
+
+        darwin*)
+        ax_pthread_flags="-pthread $ax_pthread_flags"
+        ;;
+esac
+
+# Clang doesn't consider unrecognized options an error unless we specify
+# -Werror. We throw in some extra Clang-specific options to ensure that
+# this doesn't happen for GCC, which also accepts -Werror.
+
+AC_MSG_CHECKING([if compiler needs -Werror to reject unknown flags])
+save_CFLAGS="$CFLAGS"
+ax_pthread_extra_flags="-Werror"
+CFLAGS="$CFLAGS $ax_pthread_extra_flags -Wunknown-warning-option -Wsizeof-array-argument"
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([int foo(void);],[foo()])],
+                  [AC_MSG_RESULT([yes])],
+                  [ax_pthread_extra_flags=
+                   AC_MSG_RESULT([no])])
+CFLAGS="$save_CFLAGS"
+
+if test x"$ax_pthread_ok" = xno; then
+for flag in $ax_pthread_flags; do
+
+        case $flag in
+                none)
+                AC_MSG_CHECKING([whether pthreads work without any flags])
+                ;;
+
+                -*)
+                AC_MSG_CHECKING([whether pthreads work with $flag])
+                PTHREAD_CFLAGS="$flag"
+                ;;
+
+                pthread-config)
+                AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no])
+                if test x"$ax_pthread_config" = xno; then continue; fi
+                PTHREAD_CFLAGS="`pthread-config --cflags`"
+                PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
+                ;;
+
+                *)
+                AC_MSG_CHECKING([for the pthreads library -l$flag])
+                PTHREAD_LIBS="-l$flag"
+                ;;
+        esac
+
+        save_LIBS="$LIBS"
+        save_CFLAGS="$CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS $ax_pthread_extra_flags"
+
+        # Check for various functions.  We must include pthread.h,
+        # since some functions may be macros.  (On the Sequent, we
+        # need a special flag -Kthread to make this header compile.)
+        # We check for pthread_join because it is in -lpthread on IRIX
+        # while pthread_create is in libc.  We check for pthread_attr_init
+        # due to DEC craziness with -lpthreads.  We check for
+        # pthread_cleanup_push because it is one of the few pthread
+        # functions on Solaris that doesn't have a non-functional libc stub.
+        # We try pthread_create on general principles.
+        AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>
+                        static void routine(void *a) { a = 0; }
+                        static void *start_routine(void *a) { return a; }],
+                       [pthread_t th; pthread_attr_t attr;
+                        pthread_create(&th, 0, start_routine, 0);
+                        pthread_join(th, 0);
+                        pthread_attr_init(&attr);
+                        pthread_cleanup_push(routine, 0);
+                        pthread_cleanup_pop(0) /* ; */])],
+                [ax_pthread_ok=yes],
+                [])
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        AC_MSG_RESULT([$ax_pthread_ok])
+        if test "x$ax_pthread_ok" = xyes; then
+                break;
+        fi
+
+        PTHREAD_LIBS=""
+        PTHREAD_CFLAGS=""
+done
+fi
+
+# Various other checks:
+if test "x$ax_pthread_ok" = xyes; then
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
+        AC_MSG_CHECKING([for joinable pthread attribute])
+        attr_name=unknown
+        for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+            AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>],
+                           [int attr = $attr; return attr /* ; */])],
+                [attr_name=$attr; break],
+                [])
+        done
+        AC_MSG_RESULT([$attr_name])
+        if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
+            AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], [$attr_name],
+                               [Define to necessary symbol if this constant
+                                uses a non-standard name on your system.])
+        fi
+
+        AC_MSG_CHECKING([if more special flags are required for pthreads])
+        flag=no
+        case ${host_os} in
+            aix* | freebsd* | darwin*) flag="-D_THREAD_SAFE";;
+            osf* | hpux*) flag="-D_REENTRANT";;
+            solaris*)
+            if test "$GCC" = "yes"; then
+                flag="-D_REENTRANT"
+            else
+                # TODO: What about Clang on Solaris?
+                flag="-mt -D_REENTRANT"
+            fi
+            ;;
+        esac
+        AC_MSG_RESULT([$flag])
+        if test "x$flag" != xno; then
+            PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
+        fi
+
+        AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT],
+            [ax_cv_PTHREAD_PRIO_INHERIT], [
+                AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]],
+                                                [[int i = PTHREAD_PRIO_INHERIT;]])],
+                    [ax_cv_PTHREAD_PRIO_INHERIT=yes],
+                    [ax_cv_PTHREAD_PRIO_INHERIT=no])
+            ])
+        AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes"],
+            [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.])])
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        # More AIX lossage: compile with *_r variant
+        if test "x$GCC" != xyes; then
+            case $host_os in
+                aix*)
+                AS_CASE(["x/$CC"],
+                  [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6],
+                  [#handle absolute path differently from PATH based program lookup
+                   AS_CASE(["x$CC"],
+                     [x/*],
+                     [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])],
+                     [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])])
+                ;;
+            esac
+        fi
+fi
+
+test -n "$PTHREAD_CC" || PTHREAD_CC="$CC"
+
+AC_SUBST([PTHREAD_LIBS])
+AC_SUBST([PTHREAD_CFLAGS])
+AC_SUBST([PTHREAD_CC])
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x"$ax_pthread_ok" = xyes; then
+        ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1])
+        :
+else
+        ax_pthread_ok=no
+        $2
+fi
+AC_LANG_POP
+])dnl AX_PTHREAD
--- a/makefile.unix
+++ b/makefile.unix
@ -2,7 +2,8 @@
 # system, for simple local building of the libraries and tools.
 # It will not install the libraries system-wide, but just create the 'cwebp'
 # and 'dwebp' tools in the examples/ directory, along with the static
-# library 'src/libwebp.a'.
+# libraries 'src/libwebp.a', 'src/libwebpdecoder.a', 'src/mux/libwebpmux.a' and
+# 'src/demux/libwebpdemux.a'.
 #
 # To build the library and examples, use:
 #    make -f makefile.unix
@ -10,79 +11,334 @@

 #### Customizable part ####

-# These flag assume you have libpng and libjpeg installed. If not, either
-# follow below install instructions or just comment out the next lines.
-EXTRA_FLAGS= -DWEBP_HAVE_PNG -DWEBP_HAVE_JPEG
-EXTRA_LIBS= -lpng -ljpeg
-ifeq ("$(HOSTTYPE)", "intel-mac")
+# These flags assume you have libpng, libjpeg, libtiff and libgif installed. If
+# not, either follow the install instructions below or just comment out the next
+# four lines.
+EXTRA_FLAGS= -DWEBP_HAVE_PNG -DWEBP_HAVE_JPEG -DWEBP_HAVE_TIFF
+DWEBP_LIBS= -lpng -lz
+CWEBP_LIBS= $(DWEBP_LIBS) -ljpeg -ltiff
+GIF_LIBS = -lgif
+
+ifeq ($(strip $(shell uname)), Darwin)
+  # Work around a problem linking tables marked as common symbols,
+  # cf., src/enc/yuv.[hc]
+  # Failure observed with: gcc 4.2.1 and 4.0.1.
+  EXTRA_FLAGS += -fno-common
+  EXTRA_FLAGS += -DHAVE_GLUT_GLUT_H
  EXTRA_FLAGS += -I/opt/local/include
  EXTRA_LIBS  += -L/opt/local/lib
+  GL_LIBS = -framework GLUT -framework OpenGL
+else
+  GL_LIBS = -lglut -lGL
 endif

+
 # To install libraries on Mac OS X:
 # 1. Install MacPorts (http://www.macports.org/install.php)
 # 2. Run "sudo port install jpeg"
 # 3. Run "sudo port install libpng"
+# 4. Run "sudo port install tiff"
+# 5. Run "sudo port install giflib"

 # To install libraries on Linux:
 # 1. Run "sudo apt-get install libjpeg62-dev"
 # 2. Run "sudo apt-get install libpng12-dev"
+# 3. Run "sudo apt-get install libtiff4-dev"
+# 4. Run "sudo apt-get install libgif-dev"

 # Uncomment for build for 32bit platform
 # Alternatively, you can just use the command
 # 'make -f makefile.unix EXTRA_FLAGS=-m32' to that effect.
 # EXTRA_FLAGS += -m32

+# Extra flags to enable experimental features and code
+# EXTRA_FLAGS += -DWEBP_EXPERIMENTAL_FEATURES
+
+# Extra flags to enable byte swap for 16 bit colorspaces.
+# EXTRA_FLAGS += -DWEBP_SWAP_16BIT_CSP
+
+# Extra flags to enable multi-threading
+EXTRA_FLAGS += -DWEBP_USE_THREAD
+EXTRA_LIBS += -lpthread
+
 # Extra flags to emulate C89 strictness with the full ANSI
 EXTRA_FLAGS += -Wextra -Wold-style-definition
 EXTRA_FLAGS += -Wmissing-prototypes
 EXTRA_FLAGS += -Wmissing-declarations
 EXTRA_FLAGS += -Wdeclaration-after-statement
+EXTRA_FLAGS += -Wshadow
+# EXTRA_FLAGS += -Wvla
+
+# AVX2-specific flags:
+ifeq ($(HAVE_AVX2), 1)
+EXTRA_FLAGS += -DWEBP_HAVE_AVX2
+src/dsp/%_avx2.o: EXTRA_FLAGS += -mavx2
+endif
+
+# NEON-specific flags:
+# EXTRA_FLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -mtune=cortex-a8
+# -> seems to make the overall lib slower: -fno-split-wide-types

 #### Nothing should normally be changed below this line ####

-CC = gcc -Isrc/ -Iexamples/ -Wall
+AR = ar
+ARFLAGS = r
+CC = gcc
+CPPFLAGS = -Isrc/ -Wall
 CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
-LDFLAGS = src/libwebp.a $(EXTRA_LIBS) -lm
+INSTALL = install
+GROFF = /usr/bin/groff
+COL = /usr/bin/col
+LDFLAGS = $(EXTRA_LIBS) $(EXTRA_FLAGS) -lm

-OBJS = src/enc/webpenc.o src/enc/bit_writer.o src/enc/syntax.o \
-       src/enc/dsp.o src/enc/tree.o src/enc/config.o src/enc/frame.o \
-       src/enc/quant.o src/enc/iterator.o src/enc/analysis.o \
-       src/enc/cost.o src/enc/picture.o src/enc/filter.o \
-       src/dec/bits.o src/dec/dsp.o src/dec/frame.o src/dec/webp.o \
-       src/dec/quant.o src/dec/tree.o src/dec/vp8.o src/dec/yuv.o \
-       src/dec/idec.o
-HDRS = src/webp/encode.h src/enc/vp8enci.h src/enc/bit_writer.h \
-       src/enc/cost.h src/dec/bits.h  src/dec/vp8i.h src/dec/yuv.h
-OUTPUT = examples/cwebp examples/dwebp src/libwebp.a
+DEC_OBJS = \
+    src/dec/alpha.o \
+    src/dec/buffer.o \
+    src/dec/frame.o \
+    src/dec/idec.o \
+    src/dec/io.o \
+    src/dec/quant.o \
+    src/dec/tree.o \
+    src/dec/vp8.o \
+    src/dec/vp8l.o \
+    src/dec/webp.o \

-all:ex
+DEMUX_OBJS = \
+    src/demux/demux.o \

-.c.o:  $(HDRS)
-	$(CC) $(CFLAGS) -c $< -o $@
+DSP_DEC_OBJS = \
+    src/dsp/alpha_processing.o \
+    src/dsp/alpha_processing_sse2.o \
+    src/dsp/cpu.o \
+    src/dsp/dec.o \
+    src/dsp/dec_clip_tables.o \
+    src/dsp/dec_mips32.o \
+    src/dsp/dec_neon.o \
+    src/dsp/dec_sse2.o \
+    src/dsp/lossless.o \
+    src/dsp/lossless_mips32.o \
+    src/dsp/lossless_neon.o \
+    src/dsp/lossless_sse2.o \
+    src/dsp/upsampling.o \
+    src/dsp/upsampling_neon.o \
+    src/dsp/upsampling_sse2.o \
+    src/dsp/yuv.o \
+    src/dsp/yuv_mips32.o \
+    src/dsp/yuv_sse2.o \

-libwebp.a:  $(OBJS) $(HDRS)
-	ar r src/libwebp.a $(OBJS)
+DSP_ENC_OBJS = \
+    src/dsp/enc.o \
+    src/dsp/enc_avx2.o \
+    src/dsp/enc_mips32.o \
+    src/dsp/enc_neon.o \
+    src/dsp/enc_sse2.o \

-ex: examples/cwebp.o examples/dwebp.o libwebp.a
-	$(CC) -o examples/cwebp examples/cwebp.o $(LDFLAGS)
-	$(CC) -o examples/dwebp examples/dwebp.o $(LDFLAGS)
+ENC_OBJS = \
+    src/enc/alpha.o \
+    src/enc/analysis.o \
+    src/enc/backward_references.o \
+    src/enc/config.o \
+    src/enc/cost.o \
+    src/enc/filter.o \
+    src/enc/frame.o \
+    src/enc/histogram.o \
+    src/enc/iterator.o \
+    src/enc/picture.o \
+    src/enc/picture_csp.o \
+    src/enc/picture_psnr.o \
+    src/enc/picture_rescale.o \
+    src/enc/picture_tools.o \
+    src/enc/quant.o \
+    src/enc/syntax.o \
+    src/enc/token.o \
+    src/enc/tree.o \
+    src/enc/vp8l.o \
+    src/enc/webpenc.o \
+
+EX_FORMAT_DEC_OBJS = \
+    examples/jpegdec.o \
+    examples/metadata.o \
+    examples/pngdec.o \
+    examples/tiffdec.o \
+    examples/webpdec.o \
+
+EX_UTIL_OBJS = \
+    examples/example_util.o \
+
+GIF2WEBP_UTIL_OBJS = \
+    examples/gif2webp_util.o \
+
+MUX_OBJS = \
+    src/mux/muxedit.o \
+    src/mux/muxinternal.o \
+    src/mux/muxread.o \
+
+UTILS_DEC_OBJS = \
+    src/utils/bit_reader.o \
+    src/utils/color_cache.o \
+    src/utils/filters.o \
+    src/utils/huffman.o \
+    src/utils/quant_levels_dec.o \
+    src/utils/random.o \
+    src/utils/rescaler.o \
+    src/utils/thread.o \
+    src/utils/utils.o \
+
+UTILS_ENC_OBJS = \
+    src/utils/bit_writer.o \
+    src/utils/huffman_encode.o \
+    src/utils/quant_levels.o \
+
+LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
+LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
+               $(UTILS_ENC_OBJS)
+LIBWEBPMUX_OBJS = $(MUX_OBJS)
+LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS)
+
+HDRS_INSTALLED = \
+    src/webp/decode.h \
+    src/webp/demux.h \
+    src/webp/encode.h \
+    src/webp/mux.h \
+    src/webp/mux_types.h \
+    src/webp/types.h \
+
+HDRS = \
+    src/dec/alphai.h \
+    src/dec/decode_vp8.h \
+    src/dec/vp8i.h \
+    src/dec/vp8li.h \
+    src/dec/webpi.h \
+    src/dsp/dsp.h \
+    src/dsp/lossless.h \
+    src/dsp/neon.h \
+    src/dsp/yuv.h \
+    src/dsp/yuv_tables_sse2.h \
+    src/enc/backward_references.h \
+    src/enc/cost.h \
+    src/enc/histogram.h \
+    src/enc/vp8enci.h \
+    src/enc/vp8li.h \
+    src/mux/muxi.h \
+    src/utils/bit_reader.h \
+    src/utils/bit_writer.h \
+    src/utils/color_cache.h \
+    src/utils/filters.h \
+    src/utils/huffman.h \
+    src/utils/huffman_encode.h \
+    src/utils/quant_levels.h \
+    src/utils/quant_levels_dec.h \
+    src/utils/random.h \
+    src/utils/rescaler.h \
+    src/utils/thread.h \
+    src/utils/utils.h \
+    src/webp/format_constants.h \
+    $(HDRS_INSTALLED) \
+
+OUT_LIBS = examples/libexample_util.a src/libwebpdecoder.a src/libwebp.a
+OUT_EXAMPLES = examples/cwebp examples/dwebp
+EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux
+
+OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
+ifeq ($(MAKECMDGOALS),clean)
+  OUTPUT += $(EXTRA_EXAMPLES)
+  OUTPUT += src/demux/libwebpdemux.a src/mux/libwebpmux.a
+  OUTPUT += examples/libgif2webp_util.a
+endif
+
+ex: $(OUT_EXAMPLES)
+all: ex $(EXTRA_EXAMPLES)
+
+$(EX_FORMAT_DEC_OBJS): %.o: %.h
+
+# special dependencies:
+#   tree.c/vp8.c/bit_reader.c <-> bit_reader_inl.h, endian_inl.h
+#   bit_writer.c <-> endian_inl.h
+src/dec/tree.o: src/utils/bit_reader_inl.h src/utils/endian_inl.h
+src/dec/vp8.o: src/utils/bit_reader_inl.h src/utils/endian_inl.h
+src/utils/bit_reader.o: src/utils/bit_reader_inl.h src/utils/endian_inl.h
+src/utils/bit_writer.o: src/utils/endian_inl.h
+
+%.o: %.c $(HDRS)
+	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@
+
+examples/libexample_util.a: $(EX_UTIL_OBJS)
+examples/libgif2webp_util.a: $(GIF2WEBP_UTIL_OBJS)
+src/libwebpdecoder.a: $(LIBWEBPDECODER_OBJS)
+src/libwebp.a: $(LIBWEBP_OBJS)
+src/mux/libwebpmux.a: $(LIBWEBPMUX_OBJS)
+src/demux/libwebpdemux.a: $(LIBWEBPDEMUX_OBJS)
+
+%.a:
+	$(AR) $(ARFLAGS) $@ $^
+
+examples/cwebp: examples/cwebp.o $(EX_FORMAT_DEC_OBJS)
+examples/dwebp: examples/dwebp.o
+examples/gif2webp: examples/gif2webp.o
+examples/vwebp: examples/vwebp.o
+examples/webpmux: examples/webpmux.o
+
+examples/cwebp: examples/libexample_util.a src/libwebp.a
+examples/cwebp: EXTRA_LIBS += $(CWEBP_LIBS)
+examples/dwebp: examples/libexample_util.a src/libwebpdecoder.a
+examples/dwebp: EXTRA_LIBS += $(DWEBP_LIBS)
+examples/gif2webp: examples/libexample_util.a examples/libgif2webp_util.a
+examples/gif2webp: src/mux/libwebpmux.a src/libwebp.a
+examples/gif2webp: EXTRA_LIBS += $(GIF_LIBS)
+examples/gif2webp: EXTRA_FLAGS += -DWEBP_HAVE_GIF
+examples/vwebp: examples/libexample_util.a src/demux/libwebpdemux.a
+examples/vwebp: src/libwebp.a
+examples/vwebp: EXTRA_LIBS += $(GL_LIBS)
+examples/vwebp: EXTRA_FLAGS += -DWEBP_HAVE_GL
+examples/webpmux: examples/libexample_util.a src/mux/libwebpmux.a
+examples/webpmux: src/libwebpdecoder.a
+
+$(OUT_EXAMPLES) $(EXTRA_EXAMPLES):
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+dist: DESTDIR := dist
+dist: OUT_EXAMPLES += $(EXTRA_EXAMPLES)
+dist: all
+	$(INSTALL) -m755 -d $(DESTDIR)/include/webp \
+	           $(DESTDIR)/bin $(DESTDIR)/doc $(DESTDIR)/lib
+	$(INSTALL) -m755 -s $(OUT_EXAMPLES) $(DESTDIR)/bin
+	$(INSTALL) -m644 $(HDRS_INSTALLED) $(DESTDIR)/include/webp
+	$(INSTALL) -m644 src/libwebp.a $(DESTDIR)/lib
+	$(INSTALL) -m644 src/demux/libwebpdemux.a $(DESTDIR)/lib
+	$(INSTALL) -m644 src/mux/libwebpmux.a $(DESTDIR)/lib
+	umask 022; \
+	for m in man/[cdv]webp.1 man/gif2webp.1 man/webpmux.1; do \
+	  basenam=$$(basename $$m .1); \
+	  $(GROFF) -t -e -man -T utf8 $$m \
+	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.txt; \
+	  $(GROFF) -t -e -man -T html $$m \
+	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.html; \
+	done

 clean:
-	rm -f ${OUTPUT} *~ \
-              src/enc/*.o src/enc/*~ \
+	$(RM) $(OUTPUT) *~ \
+              examples/*.o examples/*~ \
              src/dec/*.o src/dec/*~ \
-              examples/*.o examples/*~
+              src/demux/*.o src/demux/*~ \
+              src/dsp/*.o src/dsp/*~ \
+              src/enc/*.o src/enc/*~ \
+              src/mux/*.o src/mux/*~ \
+              src/utils/*.o src/utils/*~ \
+              src/webp/*~ man/*~ doc/*~ swig/*~ \

 superclean: clean
-	rm -rf .git *.log *.cache *~
-	rm -rf .deps */.deps */*/.deps
-	rm -rf .libs */.libs */*/.libs
-	rm -f */*.lo */*/*.lo
-	rm -f */*.la */*/*.la
-	rm -f Makefile */Makefile */*/Makefile
-	rm -f Makefile.in */Makefile.in */*/Makefile.in
-	rm -f config.log autom4te.cache libtool config.h stamp-h1
-	rm -f aclocal.m4 compile config.guess config.h.in config.sub config.status
-	rm -f configure depcomp install-sh ltmain.sh missing src/libwebp.pc
-	rm -f m4/*
+	$(RM) -r .git *.log *.cache *~
+	$(RM) -r .deps */.deps */*/.deps
+	$(RM) -r .libs */.libs */*/.libs
+	$(RM) */*.lo */*/*.lo
+	$(RM) */*.la */*/*.la
+	$(RM) Makefile */Makefile */*/Makefile
+	$(RM) Makefile.in */Makefile.in */*/Makefile.in
+	$(RM) config.log autom4te.cache libtool config.h stamp-h1
+	$(RM) aclocal.m4 compile
+	$(RM) config.guess config.h.in config.sub config.status
+	$(RM) configure depcomp install-sh ltmain.sh missing src/libwebp.pc
+	$(RM) m4/*
+
+.PHONY: all clean dist ex superclean
+.SUFFIXES:
--- a/man/Makefile.am
+++ b/man/Makefile.am
@ -1,2 +1,11 @@
 man_MANS = cwebp.1 dwebp.1
+if WANT_MUX
+  man_MANS += webpmux.1
+endif
+if BUILD_GIF2WEBP
+  man_MANS += gif2webp.1
+endif
+if BUILD_VWEBP
+  man_MANS += vwebp.1
+endif
 EXTRA_DIST = $(man_MANS)
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,26 +1,30 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "March  28, 2011"
+.TH CWEBP 1 "Oct 13, 2014"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
 .B cwebp
-.RI [ options ] " input_file -o output_file.webp
+.RI [ options ] " input_file \-o output_file.webp
 .br
 .SH DESCRIPTION
 This manual page documents the
 .B cwebp
 command.
 .PP
-\fBcwebp\fP compresses image using the WebP format.
-Input format can be either PNG, JPEG, or raw Y'CbCr samples.
-When using PNG, the transparency information (alpha channel) is currently
-discarded.
+\fBcwebp\fP compresses an image using the WebP format.
+Input format can be either PNG, JPEG, TIFF, WebP or raw Y'CbCr samples.
 .SH OPTIONS
 The basic options are:
 .TP
-.B \-o string
+.BI \-o " string
 Specify the name of the output WebP file. If omitted, \fBcwebp\fP will
 perform compression but only report statistics.
+Using "\-" as output name will direct output to 'stdout'.
+.TP
+.BI \-\- " string
+Explicitly specify the input file. This option is useful if the input
+file starts with an '\-' for instance. This option must appear \fBlast\fP.
+Any other options afterward will be ignored.
 .TP
 .B \-h, \-help
 A short usage summary.
@ -31,41 +35,80 @@ A summary of all the possible options.
 .B \-version
 Print the version number (as major.minor.revision) and exit.
 .TP
-.B \-q float
-Specify the compression factor between 0 and 100. Small factor
-produce smaller file with lower quality. Best quality is achieved
-using a value of 100. The default is 75.
+.BI \-q " float
+Specify the compression factor for RGB channels between 0 and 100. The default
+is 75.
+.br
+In case of lossy compression (default), a small factor produces a smaller file
+with lower quality. Best quality is achieved by using a value of 100.
+.br
+In case of lossless compression (specified by the \-lossless option), a small
+factor enables faster compression speed, but produces a larger file. Maximum
+compression is achieved by using a value of 100.
+.\" TODO(jzern): restore post-v0.4.1
+.\" .TP
+.\" .BI \-z " int
+.\" Switch on \fBlossless\fP compression mode with the specified level between 0
+.\" and 9, with level 0 being the fastest, 9 being the slowest. Fast mode
+.\" produces larger file size than slower ones. A good default is \-z 6.
+.\" This option is actually a shortcut for some predefined settings for quality
+.\" and method. If options \-q  or \-m are subsequently used, they will invalidate
+.\" the effect of this \-z option.
 .TP
-.B \-f int
+.BI \-alpha_q " int
+Specify the compression factor for alpha compression between 0 and 100.
+Lossless compression of alpha is achieved using a value of 100, while the lower
+values result in a lossy compression. The default is 100.
+.TP
+.BI \-f " int
 Specify the strength of the deblocking filter, between 0 (no filtering)
 and 100 (maximum filtering). A value of 0 will turn off any filtering.
 Higher value will increase the strength of the filtering process applied
-after decoding the picture. The higher the smoother the picture will
+after decoding the picture. The higher the value the smoother the picture will
 appear. Typical values are usually in the range of 20 to 50.
 .TP
-.B \-preset string
+.BI \-preset " string
 Specify a set of pre-defined parameters to suit a particular type of
 source material. Possible values are:  \fBdefault\fP, \fBphoto\fP,
 \fBpicture\fP, \fBdrawing\fP, \fBicon\fP, \fBtext\fP. Since
-\fB\-preset\fP overwrites the other parameter's values (except the
+\fB\-preset\fP overwrites the other parameters' values (except the
 \fB\-q\fP one), this option should preferably appear first in the
 order of the arguments.
 .TP
-.B \-sns int
+.BI \-sns " int
 Specify the amplitude of the spatial noise shaping. Spatial noise shaping
 (or \fBsns\fP for short) refers to a general collection of built-in algorithms
 used to decide which area of the picture should use relatively less bits,
 and where else to better transfer these bits. The possible range goes from
 0 (algorithm is off) to 100 (the maximal effect). The default value is 80.
 .TP
-.B \-m int
+.BI \-m " int
 Specify the compression method to use. This parameter controls the
-tradeoff between encoding speed and the compressed file size and quality.
+trade off between encoding speed and the compressed file size and quality.
 Possible values range from 0 to 6. Default value is 4.
 When higher values are used, the encoder will spend more time inspecting
 additional encoding possibilities and decide on the quality gain.
-Lower value can result is faster processing time at the expense of
-larger filesize and lower compression quality.
+Lower value can result in faster processing time at the expense of
+larger file size and lower compression quality.
+.TP
+.B \-jpeg_like
+Change the internal parameter mapping to better match the expected size
+of JPEG compression. This flag will generally produce an output file of
+similar size to its JPEG equivalent (for the same \fB\-q\fP setting), but
+with less visual distortion.
+.TP
+.B \-mt
+Use multi-threading for encoding, if possible. This option is only effective
+when using lossy compression on a source with a transparency channel.
+.TP
+.B \-low_memory
+Reduce memory usage of lossy encoding by saving four times the compressed
+size (typically). This will make the encoding slower and the output slightly
+different in size and distortion. This flag is only effective for methods
+3 and up, and is off by default. Note that leaving this flag off will have
+some side effects on the bitstream: it forces certain bitstream features
+like number of partitions (forced to 1). Note that a more detailed report
+of bitstream size is printed by \fBcwebp\fP when using this option.
 .TP
 .B \-af
 Turns auto-filter on. This algorithm will spend additional time optimizing
@ -74,73 +117,173 @@ the filtering strength to reach a well-balanced quality.
 .SH ADDITIONAL OPTIONS
 More advanced options are:
 .TP
-.B \-sharpness int
+.BI \-sharpness " int
 Specify the sharpness of the filtering (if used).
 Range is 0 (sharpest) to 7 (least sharp). Default is 0.
 .TP
 .B \-strong
-Use a stronger filtering than the default one (if filtering is being
-used thanks to the \fB\-f\fP option). Strong filtering is off by default.
+Use strong filtering (if filtering is being used thanks to the
+\fB\-f\fP option). Strong filtering is on by default.
 .TP
-.B \-segments int
+.B \-nostrong
+Disable strong filtering (if filtering is being used thanks to the
+\fB\-f\fP option) and use simple filtering instead.
+.TP
+.BI \-segments " int
 Change the number of partitions to use during the segmentation of the
 sns algorithm. Segments should be in range 1 to 4. Default value is 4.
+This option has no effect for methods 3 and up, unless \fB\-low_memory\fP
+is used.
 .TP
-.B \-size int
+.BI \-partition_limit " int
+Degrade quality by limiting the number of bits used by some macroblocks.
+Range is 0 (no degradation, the default) to 100 (full degradation).
+Useful values are usually around 30-70 for moderately large images.
+In the VP8 format, the so-called control partition has a limit of 512k and
+is used to store the following information: whether the macroblock is skipped,
+which segment it belongs to, whether it is coded as intra 4x4 or intra 16x16
+mode, and finally the prediction modes to use for each of the sub-blocks.
+For a very large image, 512k only leaves room to few bits per 16x16 macroblock.
+The absolute minimum is 4 bits per macroblock. Skip, segment, and mode
+information can use up almost all these 4 bits (although the case is unlikely),
+which is problematic for very large images. The partition_limit factor controls
+how frequently the most bit-costly mode (intra 4x4) will be used. This is
+useful in case the 512k limit is reached and the following message is displayed:
+\fIError code: 6 (PARTITION0_OVERFLOW: Partition #0 is too big to fit 512k)\fP.
+If using \fB-partition_limit\fP is not enough to meet the 512k constraint, one
+should use less segments in order to save more header bits per macroblock.
+See the \fB-segments\fP option.
+.TP
+.BI \-size " int
 Specify a target size (in bytes) to try and reach for the compressed output.
 Compressor will make several pass of partial encoding in order to get as
 close as possible to this target.
 .TP
-.B \-psnr float
+.BI \-psnr " float
 Specify a target PSNR (in dB) to try and reach for the compressed output.
 Compressor will make several pass of partial encoding in order to get as
 close as possible to this target.
 .TP
-.B \-pass int
-Set a maximum number of pass to use during the dichotomy used by
+.BI \-pass " int
+Set a maximum number of passes to use during the dichotomy used by
 options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
 .TP
-.B \-crop x_position y_position width height
+.BI \-resize " width height
+Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
+If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
+the value will be calculated preserving the aspect-ratio.
+.TP
+.BI \-crop " x_position y_position width height
 Crop the source to a rectangle with top-left corner at coordinates
-(x_position, y_position) and size width x height. This cropping area must
-be fully contained within the source rectangle.
-.B \-s width height
+(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
+This cropping area must be fully contained within the source rectangle.
+.TP
+.BI \-s " width height
 Specify that the input file actually consists of raw Y'CbCr samples following
 the ITU-R BT.601 recommendation, in 4:2:0 linear format.
 The luma plane has size \fBwidth\fP x \fBheight\fP.
 .TP
-.B \-map int
+.BI \-map " int
 Output additional ASCII-map of encoding information. Possible map values
 range from 1 to 6. This is only meant to help debugging.
 .TP
-.B \-pre int
-Specify a pre-processing filter. This option is a placeholder
-and has currently no effect.
+.BI \-pre " int
+Specify some pre-processing steps. Using a value of '2' will trigger
+quality-dependent pseudo-random dithering during RGBA->YUVA conversion
+(lossy compression only).
+.TP
+.BI \-alpha_filter " string
+Specify the predictive filtering method for the alpha plane. One of 'none',
+\&'fast' or 'best', in increasing complexity and slowness order. Default is
+\&'fast'. Internally, alpha filtering is performed using four possible
+predictions (none, horizontal, vertical, gradient). The 'best' mode will try
+each mode in turn and pick the one which gives the smaller size. The 'fast'
+mode will just try to form an a-priori guess without testing all modes.
+.TP
+.BI \-alpha_method " int
+Specify the algorithm used for alpha compression: 0 or 1. Algorithm 0 denotes
+no compression, 1 uses WebP lossless format for compression. The default is 1.
+.TP
+.B \-alpha_cleanup
+Modify unseen RGB values under fully transparent area, to help compressibility.
+The default is off.
+.TP
+.BI \-blend_alpha " int
+This option blends the alpha channel (if present) with the source using the
+background color specified in hexadecimal as 0xrrggbb. The alpha channel is
+afterward reset to the opaque value 255.
+.TP
+.B \-noalpha
+Using this option will discard the alpha channel.
+.TP
+.B \-lossless
+Encode the image without any loss.
+.TP
+.BI \-hint " string
+Specify the hint about input image type. Possible values are:
+\fBphoto\fP, \fBpicture\fP or \fBgraph\fP.
+.TP
+.BI \-metadata " string
+A comma separated list of metadata to copy from the input to the output if
+present.
+Valid values: \fBall\fP, \fBnone\fP, \fBexif\fP, \fBicc\fP, \fBxmp\fP.
+The default is \fBnone\fP.
+
+Note: each input format may not support all combinations.
+.TP
+.B \-noasm
+Disable all assembly optimizations.
 .TP
 .B \-v
 Print extra information (encoding time in particular).
 .TP
+.B \-print_psnr
+Compute and report average PSNR (Peak-Signal-To-Noise ratio).
+.TP
+.B \-print_ssim
+Compute and report average SSIM (structural similarity
+metric, see http://en.wikipedia.org/wiki/SSIM for additional details).
+.TP
+.B \-print_lsim
+Compute and report local similarity metric (sum of lowest error amongst the
+collocated pixel neighbors).
+.TP
+.B \-progress
+Report encoding progress in percent.
+.TP
 .B \-quiet
 Do not print anything.
 .TP
 .B \-short
 Only print brief information (output file size and PSNR) for testing purpose.

-.SH Examples:
-cwebp -q 70 picture.png -o picture.webp
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
 .br
-cwebp -sns 70 -f 50 -strong -af -size 60000 picture.png -o picture.webp
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting-patches/

-.SH
-.SH SEE ALSO
-.BR dwebp (1).
+.SH EXAMPLES
+cwebp \-q 50 -lossless picture.png \-o picture_lossless.webp
 .br
-Please refer to http://code.google.com/speed/webp/ for additional
-information.
-.SH AUTHOR
+cwebp \-q 70 picture_with_alpha.png \-o picture_with_alpha.webp
+.br
+cwebp \-sns 70 \-f 50 \-size 60000 picture.png \-o picture.webp
+.br
+cwebp \-o picture.webp \-\- \-\-\-picture.png
+
+.SH AUTHORS
 \fBcwebp\fP was written by the WebP team.
 .br
 The latest source tree is available at http://www.webmproject.org/code
 .PP
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR dwebp (1),
+.BR gif2webp (1)
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,7 +1,7 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "March  28, 2011"
+.TH DWEBP 1 "July 22, 2014"
 .SH NAME
-dwebp \- compress a WebP file to an image file
+dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
 .B dwebp
 .RI [ options ] " input_file.webp
@ -11,7 +11,7 @@ This manual page documents the
 .B dwebp
 command.
 .PP
-\fBdwebp\fP decompresses WebP files into PNG or PPM images.
+\fBdwebp\fP decompresses WebP files into PNG, PAM, PPM or PGM images.
 .SH OPTIONS
 The basic options are:
 .TP
@ -21,35 +21,125 @@ Print usage summary.
 .B \-version
 Print the version number (as major.minor.revision) and exit.
 .TP
-.B \-o string
+.BI \-o " string
 Specify the name of the output file (as PNG format by default).
+Using "-" as output name will direct output to 'stdout'.
+.TP
+.BI \-\- " string
+Explicitly specify the input file. This option is useful if the input
+file starts with an '\-' for instance. This option must appear \fBlast\fP.
+Any other options afterward will be ignored. If the input file is "\-",
+the data will be read from \fIstdin\fP instead of a file.
+.TP
+.B \-bmp
+Change the output format to uncompressed BMP.
+.TP
+.B \-tiff
+Change the output format to uncompressed TIFF.
+.TP
+.B \-pam
+Change the output format to PAM (retains alpha).
 .TP
 .B \-ppm
-Change the output format to PPM.
+Change the output format to PPM (discards alpha).
 .TP
 .B \-pgm
-Change the output format to PGM. The output consist of luma/chroma
-samples instead of RGB, using the ICM4 layout. This option is mainly
-for verification and debugging purpose.
+Change the output format to PGM. The output consists of luma/chroma
+samples instead of RGB, using the IMC4 layout. This option is mainly
+for verification and debugging purposes.
+.TP
+.B \-yuv
+Change the output format to raw YUV. The output consists of
+luma/chroma-U/chroma-V samples instead of RGB, saved sequentially as
+individual planes. This option is mainly for verification and debugging
+purposes.
+.TP
+.B \-nofancy
+Don't use the fancy upscaler for YUV420. This may lead to jaggy
+edges (especially the red ones), but should be faster.
+.TP
+.B \-nofilter
+Don't use the in-loop filtering process even if it is required by
+the bitstream. This may produce visible blocks on the non-compliant output,
+but it will make the decoding faster.
+.TP
+.BI \-dither " strength
+Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
+post-processing effect applied to chroma components in lossy compression.
+It helps by smoothing gradients and avoiding banding artifacts.
+.\" TODO(jzern): restore post-v0.4.1
+.\" .TP
+.\" .BI \-alpha_dither
+.\" If the compressed file contains a transparency plane that was quantized
+.\" during compression, this flag will allow dithering the reconstructed plane
+.\" in order to generate smoother transparency gradients.
+.TP
+.B \-nodither
+Disable all dithering (default).
+.TP
+.B \-mt
+Use multi-threading for decoding, if possible.
+.TP
+.BI \-crop " x_position y_position width height
+Crop the decoded picture to a rectangle with top-left corner at coordinates
+(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
+This cropping area must be fully contained within the source rectangle.
+The top-left corner will be snapped to even coordinates if needed.
+This option is meant to reduce the memory needed for cropping large images.
+Note: the cropping is applied \fIbefore\fP any scaling.
+.\" TODO(jzern): restore post-v0.4.1
+.\" .TP
+.\" .B \-flip
+.\" Flip decoded image vertically (can be useful for OpenGL textures for instance).
+.TP
+.BI \-scale " width height
+Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This
+option is mostly intended to reducing the memory needed to decode large images,
+when only a small version is needed (thumbnail, preview, etc.).  Note: scaling
+is applied \fIafter\fP cropping.
 .TP
 .B \-v
 Print extra information (decoding time in particular).
+.TP
+.B \-noasm
+Disable all assembly optimizations.

-.SH Examples:
-dwebp picture.webp -o output.png
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
 .br
-dwebp picture.webp -ppm -o output.ppm
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting-patches/

-.SH
-.SH SEE ALSO
-.BR cwebp (1).
+.SH EXAMPLES
+dwebp picture.webp \-o output.png
 .br
-Please refer to http://code.google.com/speed/webp/ for additional
-information.
-.SH AUTHOR
+dwebp picture.webp \-ppm \-o output.ppm
+.br
+dwebp \-o output.ppm \-\- \-\-\-picture.webp
+.br
+cat picture.webp | dwebp \-o \- \-\- \- > output.ppm
+
+.SH AUTHORS
 \fBdwebp\fP was written by the WebP team.
 .br
 The latest source tree is available at http://www.webmproject.org/code
 .PP
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR cwebp (1),
+.BR gif2webp (1),
+.BR webpmux (1)
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
+.SS Output file format details
+PAM: http://netpbm.sourceforge.net/doc/pam.html
+.br
+PGM: http://netpbm.sourceforge.net/doc/pgm.html
+.br
+PPM: http://netpbm.sourceforge.net/doc/ppm.html
+.br
+PNG: http://www.libpng.org/pub/png/png-sitemap.html#info
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -0,0 +1,144 @@
+.\"                                      Hey, EMACS: -*- nroff -*-
+.TH GIF2WEBP 1 "March 7, 2014"
+.SH NAME
+gif2webp \- Convert a GIF image to WebP
+.SH SYNOPSIS
+.B gif2webp
+.RI [ options ] " input_file.gif \-o output_file.webp
+.br
+.SH DESCRIPTION
+This manual page documents the
+.B gif2webp
+command.
+.PP
+\fBgif2webp\fP converts a GIF image to a WebP image.
+.SH OPTIONS
+The basic options are:
+.TP
+.BI \-o " string
+Specify the name of the output WebP file. If omitted, \fBgif2webp\fP will
+perform conversion but only report statistics.
+Using "\-" as output name will direct output to 'stdout'.
+.TP
+.B \-h, \-help
+Usage information.
+.TP
+.B \-version
+Print the version number (as major.minor.revision) and exit.
+.TP
+.B \-lossy
+Encode the image using lossy compression.
+.TP
+.B \-mixed
+Mixed compression mode: optimize compression of the image by picking either
+lossy or lossless compression for each frame heuristically.
+.TP
+.BI \-q " float
+Specify the compression factor for RGB channels between 0 and 100. The default
+is 75.
+.br
+In case of lossless compression (default), a small factor enables faster
+compression speed, but produces a larger file. Maximum compression is achieved
+by using a value of 100.
+.br
+In case of lossy compression (specified by the \-lossy option), a small factor
+produces a smaller file with lower quality. Best quality is achieved by using a
+value of 100.
+.TP
+.BI \-m " int
+Specify the compression method to use. This parameter controls the
+trade off between encoding speed and the compressed file size and quality.
+Possible values range from 0 to 6. Default value is 4.
+When higher values are used, the encoder will spend more time inspecting
+additional encoding possibilities and decide on the quality gain.
+Lower value can result is faster processing time at the expense of
+larger file size and lower compression quality.
+.TP
+.BI \-kmin " int
+.TP
+.BI \-kmax " int
+Specify the minimum and maximum distance between consecutive key frames
+(independently decodable frames) in the output animation. The tool will insert
+some key frames into the output animation as needed so that this criteria is
+satisfied.
+.br
+A 'kmin' value of 0 will turn off insertion of key frames.
+Typical values are in the range 3 to 30. Default values are kmin = 9,
+kmax = 17 for lossless compression and kmin = 3, kmax = 5 for lossy compression.
+.br
+These two options are relevant only for animated images with large number of
+frames (>50).
+.br
+When lower values are used, more frames will be converted to key frames. This
+may lead to smaller number of frames required to decode a frame on average,
+thereby improving the decoding performance. But this may lead to slightly bigger
+file sizes.
+Higher values may lead to worse decoding performance, but smaller file sizes.
+.br
+Some restrictions:
+.br
+(i) kmin < kmax,
+.br
+(ii) kmin >= kmax / 2 + 1 and
+.br
+(iii) kmax - kmin <= 30.
+.br
+If any of these restrictions are not met, they will be enforced automatically.
+.TP
+.BI \-metadata " string
+A comma separated list of metadata to copy from the input to the output if
+present.
+Valid values: \fBall\fP, \fBnone\fP, \fBicc\fP, \fBxmp\fP.
+The default is \fBxmp\fP.
+.TP
+.BI \-f " int
+For lossy encoding only (specified by the \-lossy option). Specify the strength
+of the deblocking filter, between 0 (no filtering) and 100 (maximum filtering).
+A value of 0 will turn off any filtering. Higher value will increase the
+strength of the filtering process applied after decoding the picture. The higher
+the value the smoother the picture will appear. Typical values are usually in
+the range of 20 to 50.
+.TP
+.B \-mt
+Use multi-threading for encoding, if possible. This option is only effective
+when using lossy compression.
+.TP
+.B \-v
+Print extra information.
+.TP
+.B \-quiet
+Do not print anything.
+
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
+.br
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting-patches/
+
+.SH EXAMPLES
+gif2webp picture.gif \-o picture.webp
+.br
+gif2webp \-q 70 picture.gif \-o picture.webp
+.br
+gif2webp \-lossy \-m 3 picture.gif \-o picture_lossy.webp
+.br
+gif2webp \-lossy \-f 50 picture.gif \-o picture.webp
+.br
+gif2webp \-q 70 \-o picture.webp \-\- \-\-\-picture.gif
+
+.SH AUTHORS
+\fBgif2webp\fP was written by the WebP team.
+.br
+The latest source tree is available at http://www.webmproject.org/code
+.PP
+This manual page was written by Urvang Joshi <urvang@google.com>, for the
+Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR cwebp (1),
+.BR dwebp (1),
+.BR webpmux (1)
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/man/vwebp.1
+++ b/man/vwebp.1
@ -0,0 +1,91 @@
+.\"                                      Hey, EMACS: -*- nroff -*-
+.TH VWEBP 1 "July 23, 2014"
+.SH NAME
+vwebp \- decompress a WebP file and display it in a window
+.SH SYNOPSIS
+.B vwebp
+.RI [ options ] " input_file.webp
+.br
+.SH DESCRIPTION
+This manual page documents the
+.B vwebp
+command.
+.PP
+\fBvwebp\fP decompresses a WebP file and displays it in a window using OpenGL.
+.SH OPTIONS
+.TP
+.B \-h
+Print usage summary.
+.TP
+.B \-version
+Print version number and exit.
+.TP
+.B \-noicc
+Don't use the ICC profile if present.
+.TP
+.B \-nofancy
+Don't use the fancy YUV420 upscaler.
+.TP
+.B \-nofilter
+Disable in-loop filtering.
+.TP
+.BI \-dither " strength
+Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
+post-processing effect applied to chroma components in lossy compression.
+It helps by smoothing gradients and avoiding banding artifacts. Default: 50.
+.\" TODO(jzern): restore post-v0.4.1
+.\" .TP
+.\" .BI \-noalphadither
+.\" By default, quantized transparency planes are dithered during decompression,
+.\" to smooth the gradients. This flag will prevent this dithering.
+.TP
+.B \-mt
+Use multi-threading for decoding, if possible.
+.TP
+.B \-info
+Display image information on top of the decoded image.
+.TP
+.BI \-\- " string
+Explicitly specify the input file. This option is useful if the input
+file starts with an '\-' for instance. This option must appear \fBlast\fP.
+Any other options afterward will be ignored. If the input file is "\-",
+the data will be read from \fIstdin\fP instead of a file.
+.TP
+
+.SH KEYBOARD SHORTCUTS
+.TP
+.B 'c'
+Toggle use of color profile.
+.TP
+.B 'i'
+Overlay file information.
+.TP
+.B 'q' / 'Q' / ESC
+Quit.
+
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
+.br
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting-patches/
+
+.SH EXAMPLES
+vwebp picture.webp
+.br
+vwebp picture.webp -mt -dither 0
+.br
+vwebp \-\- \-\-\-picture.webp
+
+.SH AUTHORS
+\fBvwebp\fP was written by the WebP team.
+.br
+The latest source tree is available at http://www.webmproject.org/code
+.PP
+This manual page was written for the Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR dwebp (1)
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -0,0 +1,211 @@
+.\"                                      Hey, EMACS: -*- nroff -*-
+.TH WEBPMUX 1 "August 28, 2014"
+.SH NAME
+webpmux \- create animated WebP files from non\-animated WebP images, extract
+frames from animated WebP images, and manage XMP/EXIF metadata and ICC profile.
+.SH SYNOPSIS
+.B webpmux \-get
+.I GET_OPTIONS
+.I INPUT
+.B \-o
+.I OUTPUT
+.br
+.B webpmux \-set
+.I SET_OPTIONS
+.I INPUT
+.B \-o
+.I OUTPUT
+.br
+.B webpmux \-strip
+.I STRIP_OPTIONS
+.I INPUT
+.B \-o
+.I OUTPUT
+.br
+.B webpmux \-frame
+.I FRAME_OPTIONS
+.B [ \-frame ... ] [ \-loop
+.I LOOP_COUNT
+.B ]
+.br
+.RS 8
+.B [ \-bgcolor
+.I BACKGROUND_COLOR
+.B ] \-o
+.I OUTPUT
+.RE
+.br
+.B webpmux \-info
+.I INPUT
+.br
+.B webpmux [\-h|\-help]
+.br
+.B webpmux \-version
+.SH DESCRIPTION
+This manual page documents the
+.B webpmux
+command.
+.PP
+\fBwebpmux\fP can be used to create/extract from animated WebP files, as well as
+to add/extract/strip XMP/EXIF metadata and ICC profile.
+.SH OPTIONS
+.SS GET_OPTIONS (\-get):
+.TP
+.B icc
+Get ICC profile.
+.TP
+.B exif
+Get EXIF metadata.
+.TP
+.B xmp
+Get XMP metadata.
+.TP
+.BI frame " n
+Get nth frame from an animated image. (n = 0 has a special meaning: last frame).
+
+.SS SET_OPTIONS (\-set)
+.TP
+.BI icc " file.icc
+Set ICC profile.
+.P
+Where: 'file.icc' contains the ICC profile to be set.
+.TP
+.BI exif " file.exif
+Set EXIF metadata.
+.P
+Where: 'file.exif' contains the EXIF metadata to be set.
+.TP
+.BI xmp " file.xmp
+Set XMP metadata.
+.P
+Where: 'file.xmp' contains the XMP metadata to be set.
+
+.SS STRIP_OPTIONS (\-strip)
+.TP
+.B icc
+Strip ICC profile.
+.TP
+.B exif
+Strip EXIF metadata.
+.TP
+.B xmp
+Strip XMP metadata.
+
+.SS FRAME_OPTIONS (\-frame)
+Create an animated WebP file from multiple (non\-animated) WebP images.
+.TP
+.I file_i +di[+xi+yi[+mi[bi]]]
+Where: 'file_i' is the i'th frame (WebP format), 'xi','yi' specify the image
+offset for this frame, 'di' is the pause duration before next frame, 'mi' is
+the dispose method for this frame (0 for NONE or 1 for BACKGROUND) and 'bi' is
+the blending method for this frame (+b for BLEND or \-b for NO_BLEND).
+Argument 'bi' can be omitted and will default to +b (BLEND).
+Also, 'mi' can be omitted if 'bi' is omitted and will default to 0 (NONE).
+Finally, if 'mi' and 'bi' are omitted then 'xi' and 'yi' can be omitted and will
+default to +0+0.
+.TP
+.BI \-loop " n
+Loop the frames n number of times. 0 indicates the frames should loop forever.
+Valid range is 0 to 65535 [Default: 0 (infinite)].
+.TP
+.BI \-bgcolor " A,R,G,B
+Background color of the canvas.
+.br
+where: 'A', 'R', 'G' and 'B' are integers in the range 0 to 255 specifying the
+Alpha, Red, Green and Blue component values respectively
+[Default: 255,255,255,255].
+
+.SS INPUT
+.TP
+Input file in WebP format.
+
+.SS OUTPUT (\-o)
+.TP
+Output file in WebP format.
+
+.SS Note:
+.TP
+The nature of EXIF, XMP and ICC data is not checked and is assumed to be valid.
+
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
+.br
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting\-patches/
+
+.SH EXAMPLES
+.P
+Add ICC profile:
+.br
+webpmux \-set icc image_profile.icc in.webp \-o icc_container.webp
+.P
+Extract ICC profile:
+.br
+webpmux \-get icc icc_container.webp \-o image_profile.icc
+.P
+Strip ICC profile:
+.br
+webpmux \-strip icc icc_container.webp \-o without_icc.webp
+.P
+Add XMP metadata:
+.br
+webpmux \-set xmp image_metadata.xmp in.webp \-o xmp_container.webp
+.P
+Extract XMP metadata:
+.br
+webpmux \-get xmp xmp_container.webp \-o image_metadata.xmp
+.P
+Strip XMP metadata:
+.br
+webpmux \-strip xmp xmp_container.webp \-o without_xmp.webp
+.P
+Add EXIF metadata:
+.br
+webpmux \-set exif image_metadata.exif in.webp \-o exif_container.webp
+.P
+Extract EXIF metadata:
+.br
+webpmux \-get exif exif_container.webp \-o image_metadata.exif
+.P
+Strip EXIF metadata:
+.br
+webpmux \-strip exif exif_container.webp \-o without_exif.webp
+.P
+Create an animated WebP file from 3 (non\-animated) WebP images:
+.br
+webpmux \-frame 1.webp +100 \-frame 2.webp +100+50+50
+.br
+.RS 8
+\-frame 3.webp +100+50+50+1+b \-loop 10 \-bgcolor 255,255,255,255
+.br
+\-o anim_container.webp
+.RE
+.P
+Get the 2nd frame from an animated WebP file:
+.br
+webpmux \-get frame 2 anim_container.webp \-o frame_2.webp
+.P
+Using \-get/\-set/\-strip with input file name starting with '\-':
+.br
+webpmux \-set icc image_profile.icc \-o icc_container.webp \-\- \-\-\-in.webp
+.br
+webpmux \-get icc \-o image_profile.icc \-\- \-\-\-icc_container.webp
+.br
+webpmux \-strip icc \-o without_icc.webp \-\- \-\-\-icc_container.webp
+
+.SH AUTHORS
+\fBwebpmux\fP is written by the WebP team.
+.br
+The latest source tree is available at http://www.webmproject.org/code
+.PP
+This manual page was written by Vikas Arora <vikaas.arora@gmail.com>,
+for the Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR cwebp (1),
+.BR dwebp (1),
+.BR gif2webp (1)
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/src/.gitignore
+++ b/src/.gitignore
@ -1 +0,0 @@
-/*.pc
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -1,16 +1,54 @@
-SUBDIRS = dec enc
+# The mux and demux libraries depend on libwebp, thus the '.' to force the
+# build order so it's available to them.
+SUBDIRS = dec enc dsp utils .
+if WANT_MUX
+  SUBDIRS += mux
+endif
+if WANT_DEMUX
+  SUBDIRS += demux
+endif

-AM_CPPFLAGS = -I$(top_srcdir)/src
 lib_LTLIBRARIES = libwebp.la

-libwebp_la_SOURCES =
-libwebp_la_LIBADD = dec/libwebpdecode.la \
-                    enc/libwebpencode.la
-libwebp_la_LDFLAGS = -version-info 0:0:0
-libwebpinclude_HEADERS = webp/types.h webp/decode.h webp/decode_vp8.h \
-                         webp/encode.h
-libwebpincludedir = $(includedir)/webp
+if BUILD_LIBWEBPDECODER
+  lib_LTLIBRARIES += libwebpdecoder.la
+endif

+common_HEADERS =
+common_HEADERS += webp/decode.h
+common_HEADERS += webp/types.h
+commondir = $(includedir)/webp
+
+libwebp_la_SOURCES =
+libwebpinclude_HEADERS =
+libwebpinclude_HEADERS += webp/encode.h
+noinst_HEADERS =
+noinst_HEADERS += webp/format_constants.h
+
+libwebp_la_LIBADD =
+libwebp_la_LIBADD += dec/libwebpdecode.la
+libwebp_la_LIBADD += dsp/libwebpdsp.la
+libwebp_la_LIBADD += enc/libwebpencode.la
+libwebp_la_LIBADD += utils/libwebputils.la
+
+# Use '-no-undefined' to declare that libwebp does not depend on any libraries
+# other than the ones listed on the command line, i.e., after linking, it will
+# not have unresolved symbols. Some platforms (Windows among them) require all
+# symbols in shared libraries to be resolved at library creation.
+libwebp_la_LDFLAGS = -no-undefined -version-info 5:2:0
+libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

+if BUILD_LIBWEBPDECODER
+  libwebpdecoder_la_SOURCES =
+
+  libwebpdecoder_la_LIBADD =
+  libwebpdecoder_la_LIBADD += dec/libwebpdecode.la
+  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
+  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la
+
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:2:0
+  pkgconfig_DATA += libwebpdecoder.pc
+endif
+
 ${pkgconfig_DATA}: ${top_builddir}/config.status
--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@ -1,14 +1,27 @@
-AM_CPPFLAGS = -I$(top_srcdir)/src
-
-libwebpdecode_la_SOURCES = bits.h vp8i.h yuv.h bits.c dsp.c frame.c \
-                          quant.c tree.c vp8.c webp.c yuv.c idec.c
-libwebpdecode_la_LDFLAGS = -version-info 0:0:0
-libwebpdecodeinclude_HEADERS = ../webp/decode.h ../webp/decode_vp8.h ../webp/types.h
-libwebpdecodeincludedir = $(includedir)/webp
-
-noinst_HEADERS = bits.h vp8i.h webpi.h yuv.h
-
 noinst_LTLIBRARIES = libwebpdecode.la
-# uncomment the following line (and comment the above) if you want
-# to install libwebpdecode library.
-#lib_LTLIBRARIES = libwebpdecode.la
+
+libwebpdecode_la_SOURCES =
+libwebpdecode_la_SOURCES += alpha.c
+libwebpdecode_la_SOURCES += alphai.h
+libwebpdecode_la_SOURCES += buffer.c
+libwebpdecode_la_SOURCES += decode_vp8.h
+libwebpdecode_la_SOURCES += frame.c
+libwebpdecode_la_SOURCES += idec.c
+libwebpdecode_la_SOURCES += io.c
+libwebpdecode_la_SOURCES += quant.c
+libwebpdecode_la_SOURCES += tree.c
+libwebpdecode_la_SOURCES += vp8.c
+libwebpdecode_la_SOURCES += vp8i.h
+libwebpdecode_la_SOURCES += vp8l.c
+libwebpdecode_la_SOURCES += vp8li.h
+libwebpdecode_la_SOURCES += webp.c
+libwebpdecode_la_SOURCES += webpi.h
+
+libwebpdecodeinclude_HEADERS =
+libwebpdecodeinclude_HEADERS += ../webp/decode.h
+libwebpdecodeinclude_HEADERS += ../webp/types.h
+noinst_HEADERS =
+noinst_HEADERS += ../webp/format_constants.h
+
+libwebpdecode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
+libwebpdecodeincludedir = $(includedir)/webp
--- a/src/dec/alpha.c
+++ b/src/dec/alpha.c
@ -0,0 +1,165 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha-plane decompression.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "./alphai.h"
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "../utils/quant_levels_dec.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+//------------------------------------------------------------------------------
+// ALPHDecoder object.
+
+ALPHDecoder* ALPHNew(void) {
+  ALPHDecoder* const dec = (ALPHDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
+  return dec;
+}
+
+void ALPHDelete(ALPHDecoder* const dec) {
+  if (dec != NULL) {
+    VP8LDelete(dec->vp8l_dec_);
+    dec->vp8l_dec_ = NULL;
+    WebPSafeFree(dec);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Decoding.
+
+// Initialize alpha decoding by parsing the alpha header and decoding the image
+// header for alpha data stored using lossless compression.
+// Returns false in case of error in alpha header (data too short, invalid
+// compression method or filter, error in lossless header data etc).
+static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
+                    size_t data_size, int width, int height, uint8_t* output) {
+  int ok = 0;
+  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
+  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
+  int rsrv;
+
+  assert(width > 0 && height > 0);
+  assert(data != NULL && output != NULL);
+
+  dec->width_ = width;
+  dec->height_ = height;
+
+  if (data_size <= ALPHA_HEADER_LEN) {
+    return 0;
+  }
+
+  dec->method_ = (data[0] >> 0) & 0x03;
+  dec->filter_ = (data[0] >> 2) & 0x03;
+  dec->pre_processing_ = (data[0] >> 4) & 0x03;
+  rsrv = (data[0] >> 6) & 0x03;
+  if (dec->method_ < ALPHA_NO_COMPRESSION ||
+      dec->method_ > ALPHA_LOSSLESS_COMPRESSION ||
+      dec->filter_ >= WEBP_FILTER_LAST ||
+      dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS ||
+      rsrv != 0) {
+    return 0;
+  }
+
+  if (dec->method_ == ALPHA_NO_COMPRESSION) {
+    const size_t alpha_decoded_size = dec->width_ * dec->height_;
+    ok = (alpha_data_size >= alpha_decoded_size);
+  } else {
+    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
+    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
+  }
+  return ok;
+}
+
+// Decodes, unfilters and dequantizes *at least* 'num_rows' rows of alpha
+// starting from row number 'row'. It assumes that rows up to (row - 1) have
+// already been decoded.
+// Returns false in case of bitstream error.
+static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
+  ALPHDecoder* const alph_dec = dec->alph_dec_;
+  const int width = alph_dec->width_;
+  const int height = alph_dec->height_;
+  WebPUnfilterFunc unfilter_func = WebPUnfilters[alph_dec->filter_];
+  uint8_t* const output = dec->alpha_plane_;
+  if (alph_dec->method_ == ALPHA_NO_COMPRESSION) {
+    const size_t offset = row * width;
+    const size_t num_pixels = num_rows * width;
+    assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN + offset + num_pixels);
+    memcpy(dec->alpha_plane_ + offset,
+           dec->alpha_data_ + ALPHA_HEADER_LEN + offset, num_pixels);
+  } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
+    assert(alph_dec->vp8l_dec_ != NULL);
+    if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) {
+      return 0;
+    }
+  }
+
+  if (unfilter_func != NULL) {
+    unfilter_func(width, height, width, row, num_rows, output);
+  }
+
+  if (row + num_rows == dec->pic_hdr_.height_) {
+    dec->is_alpha_decoded_ = 1;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main entry point.
+
+const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      int row, int num_rows) {
+  const int width = dec->pic_hdr_.width_;
+  const int height = dec->pic_hdr_.height_;
+
+  if (row < 0 || num_rows <= 0 || row + num_rows > height) {
+    return NULL;    // sanity check.
+  }
+
+  if (row == 0) {
+    // Initialize decoding.
+    assert(dec->alpha_plane_ != NULL);
+    dec->alph_dec_ = ALPHNew();
+    if (dec->alph_dec_ == NULL) return NULL;
+    if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
+                  width, height, dec->alpha_plane_)) {
+      ALPHDelete(dec->alph_dec_);
+      dec->alph_dec_ = NULL;
+      return NULL;
+    }
+    // if we allowed use of alpha dithering, check whether it's needed at all
+    if (dec->alph_dec_->pre_processing_ != ALPHA_PREPROCESSED_LEVELS) {
+      dec->alpha_dithering_ = 0;  // disable dithering
+    } else {
+      num_rows = height;          // decode everything in one pass
+    }
+  }
+
+  if (!dec->is_alpha_decoded_) {
+    int ok = 0;
+    assert(dec->alph_dec_ != NULL);
+    ok = ALPHDecode(dec, row, num_rows);
+    if (ok && dec->alpha_dithering_ > 0) {
+      ok = WebPDequantizeLevels(dec->alpha_plane_, width, height,
+                                dec->alpha_dithering_);
+    }
+    if (!ok || dec->is_alpha_decoded_) {
+      ALPHDelete(dec->alph_dec_);
+      dec->alph_dec_ = NULL;
+    }
+    if (!ok) return NULL;  // Error.
+  }
+
+  // Return a pointer to the current decoded row.
+  return dec->alpha_plane_ + row * width;
+}
--- a/src/dec/alphai.h
+++ b/src/dec/alphai.h
@ -0,0 +1,55 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha decoder: internal header.
+//
+// Author: Urvang (urvang@google.com)
+
+#ifndef WEBP_DEC_ALPHAI_H_
+#define WEBP_DEC_ALPHAI_H_
+
+#include "./webpi.h"
+#include "../utils/filters.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8LDecoder;  // Defined in dec/vp8li.h.
+
+typedef struct ALPHDecoder ALPHDecoder;
+struct ALPHDecoder {
+  int width_;
+  int height_;
+  int method_;
+  WEBP_FILTER_TYPE filter_;
+  int pre_processing_;
+  struct VP8LDecoder* vp8l_dec_;
+  VP8Io io_;
+  int use_8b_decode;  // Although alpha channel requires only 1 byte per
+                      // pixel, sometimes VP8LDecoder may need to allocate
+                      // 4 bytes per pixel internally during decode.
+};
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// Allocates a new alpha decoder instance.
+ALPHDecoder* ALPHNew(void);
+
+// Clears and deallocates an alpha decoder instance.
+void ALPHDelete(ALPHDecoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_ALPHAI_H_ */
--- a/src/dec/bits.c
+++ b/src/dec/bits.c
@ -1,79 +0,0 @@
-// Copyright 2010 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// Boolean decoder
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "bits.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//-----------------------------------------------------------------------------
-// VP8BitReader
-
-void VP8InitBitReader(VP8BitReader* const br,
-                      const uint8_t* const start, const uint8_t* const end) {
-  assert(br);
-  assert(start);
-  assert(start <= end);
-  br->range_   = 255 - 1;
-  br->buf_     = start;
-  br->buf_end_ = end;
-  br->value_   = 0;
-  br->missing_ = 8;
-  br->eof_     = 0;
-}
-
-const uint8_t kVP8Log2Range[128] = {
-     7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  0
-};
-
-// range = ((range + 1) << kVP8Log2Range[range]) - 1
-const uint8_t kVP8NewRange[128] = {
-  127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
-  127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
-  247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
-  183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239,
-  243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149,
-  151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179,
-  181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209,
-  211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239,
-  241, 243, 245, 247, 249, 251, 253, 127
-};
-
-//-----------------------------------------------------------------------------
-// Higher-level calls
-
-uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
-  uint32_t v = 0;
-  while (bits-- > 0) {
-    v |= VP8GetBit(br, 0x80) << bits;
-  }
-  return v;
-}
-
-int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
-  const int value = VP8GetValue(br, bits);
-  return VP8Get(br) ? -value : value;
-}
-
-//-----------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/bits.h
+++ b/src/dec/bits.h
@ -1,107 +0,0 @@
-// Copyright 2010 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// Boolean decoder
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#ifndef WEBP_DEC_BITS_H_
-#define WEBP_DEC_BITS_H_
-
-#include <assert.h>
-#include "webp/decode_vp8.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//-----------------------------------------------------------------------------
-// Bitreader and code-tree reader
-
-typedef struct {
-  const uint8_t* buf_;        // next byte to be read
-  const uint8_t* buf_end_;    // end of read buffer
-  int eof_;                   // true if input is exhausted
-
-  // boolean decoder
-  uint32_t range_;            // current range minus 1. In [127, 254] interval.
-  uint32_t value_;            // current value
-  int missing_;               // number of missing bits in value_ (8bit)
-} VP8BitReader;
-
-// Initialize the bit reader and the boolean decoder.
-void VP8InitBitReader(VP8BitReader* const br,
-                      const uint8_t* const start, const uint8_t* const end);
-
-// return the next value made of 'num_bits' bits
-uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
-static inline uint32_t VP8Get(VP8BitReader* const br) {
-  return VP8GetValue(br, 1);
-}
-
-// return the next value with sign-extension.
-int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
-
-// Read a bit with proba 'prob'. Speed-critical function!
-extern const uint8_t kVP8Log2Range[128];
-extern const uint8_t kVP8NewRange[128];
-static inline uint32_t VP8GetByte(VP8BitReader* const br) {
-  assert(br);
-  if (br->buf_ < br->buf_end_) {
-    assert(br->buf_);
-    return *br->buf_++;
-  }
-  br->eof_ = 1;
-  return 0xff;
-}
-
-static inline uint32_t VP8BitUpdate(VP8BitReader* const br, uint32_t split) {
-  uint32_t bit;
-  // Make sure we have a least 8 bits in 'value_'
-  if (br->missing_ > 0) {
-    br->value_ |= VP8GetByte(br) << br->missing_;
-    br->missing_ -= 8;
-  }
-  bit = ((br->value_ >> 8) > split);
-  if (bit) {
-    br->range_ -= split + 1;
-    br->value_ -= (split + 1) << 8;
-  } else {
-    br->range_ = split;
-  }
-  return bit;
-}
-
-static inline void VP8Shift(VP8BitReader* const br) {
-  // range_ is in [0..127] interval here.
-  const int shift = kVP8Log2Range[br->range_];
-  br->range_ = kVP8NewRange[br->range_];
-  br->value_ <<= shift;
-  br->missing_ += shift;
-}
-
-static inline uint32_t VP8GetBit(VP8BitReader* const br, int prob) {
-  const uint32_t split = (br->range_ * prob) >> 8;
-  const uint32_t bit = VP8BitUpdate(br, split);
-  if (br->range_ < 0x7f) {
-    VP8Shift(br);
-  }
-  return bit;
-}
-
-static inline int VP8GetSigned(VP8BitReader* const br, int v) {
-  const uint32_t split = br->range_ >> 1;
-  const uint32_t bit = VP8BitUpdate(br, split);
-  VP8Shift(br);
-  return bit ? -v : v;
-}
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif  // WEBP_DEC_BITS_H_
--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@ -0,0 +1,251 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Everything about WebPDecBuffer
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+
+#include "./vp8i.h"
+#include "./webpi.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// WebPDecBuffer
+
+// Number of bytes per pixel for the different color-spaces.
+static const int kModeBpp[MODE_LAST] = {
+  3, 4, 3, 4, 4, 2, 2,
+  4, 4, 4, 2,    // pre-multiplied modes
+  1, 1 };
+
+// Check that webp_csp_mode is within the bounds of WEBP_CSP_MODE.
+// Convert to an integer to handle both the unsigned/signed enum cases
+// without the need for casting to remove type limit warnings.
+static int IsValidColorspace(int webp_csp_mode) {
+  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
+}
+
+static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
+  int ok = 1;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
+  const int width = buffer->width;
+  const int height = buffer->height;
+  if (!IsValidColorspace(mode)) {
+    ok = 0;
+  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
+    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int y_stride = abs(buf->y_stride);
+    const int u_stride = abs(buf->u_stride);
+    const int v_stride = abs(buf->v_stride);
+    const int a_stride = abs(buf->a_stride);
+    const uint64_t y_size = (uint64_t)y_stride * height;
+    const uint64_t u_size = (uint64_t)u_stride * ((height + 1) / 2);
+    const uint64_t v_size = (uint64_t)v_stride * ((height + 1) / 2);
+    const uint64_t a_size = (uint64_t)a_stride * height;
+    ok &= (y_size <= buf->y_size);
+    ok &= (u_size <= buf->u_size);
+    ok &= (v_size <= buf->v_size);
+    ok &= (y_stride >= width);
+    ok &= (u_stride >= (width + 1) / 2);
+    ok &= (v_stride >= (width + 1) / 2);
+    ok &= (buf->y != NULL);
+    ok &= (buf->u != NULL);
+    ok &= (buf->v != NULL);
+    if (mode == MODE_YUVA) {
+      ok &= (a_stride >= width);
+      ok &= (a_size <= buf->a_size);
+      ok &= (buf->a != NULL);
+    }
+  } else {    // RGB checks
+    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
+    const int stride = abs(buf->stride);
+    const uint64_t size = (uint64_t)stride * height;
+    ok &= (size <= buf->size);
+    ok &= (stride >= width * kModeBpp[mode]);
+    ok &= (buf->rgba != NULL);
+  }
+  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
+}
+
+static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
+  const int w = buffer->width;
+  const int h = buffer->height;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
+
+  if (w <= 0 || h <= 0 || !IsValidColorspace(mode)) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+
+  if (!buffer->is_external_memory && buffer->private_memory == NULL) {
+    uint8_t* output;
+    int uv_stride = 0, a_stride = 0;
+    uint64_t uv_size = 0, a_size = 0, total_size;
+    // We need memory and it hasn't been allocated yet.
+    // => initialize output buffer, now that dimensions are known.
+    const int stride = w * kModeBpp[mode];
+    const uint64_t size = (uint64_t)stride * h;
+
+    if (!WebPIsRGBMode(mode)) {
+      uv_stride = (w + 1) / 2;
+      uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
+      if (mode == MODE_YUVA) {
+        a_stride = w;
+        a_size = (uint64_t)a_stride * h;
+      }
+    }
+    total_size = size + 2 * uv_size + a_size;
+
+    // Security/sanity checks
+    output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
+    if (output == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    buffer->private_memory = output;
+
+    if (!WebPIsRGBMode(mode)) {   // YUVA initialization
+      WebPYUVABuffer* const buf = &buffer->u.YUVA;
+      buf->y = output;
+      buf->y_stride = stride;
+      buf->y_size = (size_t)size;
+      buf->u = output + size;
+      buf->u_stride = uv_stride;
+      buf->u_size = (size_t)uv_size;
+      buf->v = output + size + uv_size;
+      buf->v_stride = uv_stride;
+      buf->v_size = (size_t)uv_size;
+      if (mode == MODE_YUVA) {
+        buf->a = output + size + 2 * uv_size;
+      }
+      buf->a_size = (size_t)a_size;
+      buf->a_stride = a_stride;
+    } else {  // RGBA initialization
+      WebPRGBABuffer* const buf = &buffer->u.RGBA;
+      buf->rgba = output;
+      buf->stride = stride;
+      buf->size = (size_t)size;
+    }
+  }
+  return CheckDecBuffer(buffer);
+}
+
+VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
+  if (buffer == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  if (WebPIsRGBMode(buffer->colorspace)) {
+    WebPRGBABuffer* const buf = &buffer->u.RGBA;
+    buf->rgba += (buffer->height - 1) * buf->stride;
+    buf->stride = -buf->stride;
+  } else {
+    WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int H = buffer->height;
+    buf->y += (H - 1) * buf->y_stride;
+    buf->y_stride = -buf->y_stride;
+    buf->u += ((H - 1) >> 1) * buf->u_stride;
+    buf->u_stride = -buf->u_stride;
+    buf->v += ((H - 1) >> 1) * buf->v_stride;
+    buf->v_stride = -buf->v_stride;
+    if (buf->a != NULL) {
+      buf->a += (H - 1) * buf->a_stride;
+      buf->a_stride = -buf->a_stride;
+    }
+  }
+  return VP8_STATUS_OK;
+}
+
+VP8StatusCode WebPAllocateDecBuffer(int w, int h,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const out) {
+  VP8StatusCode status;
+  if (out == NULL || w <= 0 || h <= 0) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  if (options != NULL) {    // First, apply options if there is any.
+    if (options->use_cropping) {
+      const int cw = options->crop_width;
+      const int ch = options->crop_height;
+      const int x = options->crop_left & ~1;
+      const int y = options->crop_top & ~1;
+      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
+        return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
+      }
+      w = cw;
+      h = ch;
+    }
+    if (options->use_scaling) {
+      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+        return VP8_STATUS_INVALID_PARAM;
+      }
+      w = options->scaled_width;
+      h = options->scaled_height;
+    }
+  }
+  out->width = w;
+  out->height = h;
+
+  // Then, allocate buffer for real.
+  status = AllocateBuffer(out);
+  if (status != VP8_STATUS_OK) return status;
+
+#if WEBP_DECODER_ABI_VERSION > 0x0203
+  // Use the stride trick if vertical flip is needed.
+  if (options != NULL && options->flip) {
+    status = WebPFlipBuffer(out);
+  }
+#endif
+  return status;
+}
+
+//------------------------------------------------------------------------------
+// constructors / destructors
+
+int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
+    return 0;  // version mismatch
+  }
+  if (buffer == NULL) return 0;
+  memset(buffer, 0, sizeof(*buffer));
+  return 1;
+}
+
+void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
+  if (buffer != NULL) {
+    if (!buffer->is_external_memory) {
+      WebPSafeFree(buffer->private_memory);
+    }
+    buffer->private_memory = NULL;
+  }
+}
+
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst) {
+  if (src != NULL && dst != NULL) {
+    *dst = *src;
+    if (src->private_memory != NULL) {
+      dst->is_external_memory = 1;   // dst buffer doesn't own the memory.
+      dst->private_memory = NULL;
+    }
+  }
+}
+
+// Copy and transfer ownership from src to dst (beware of parameter order!)
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
+  if (src != NULL && dst != NULL) {
+    *dst = *src;
+    if (src->private_memory != NULL) {
+      src->is_external_memory = 1;   // src relinquishes ownership
+      src->private_memory = NULL;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
--- a/src/dec/decode_vp8.h
+++ b/src/dec/decode_vp8.h
@ -0,0 +1,185 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Low-level API for VP8 decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_WEBP_DECODE_VP8_H_
+#define WEBP_WEBP_DECODE_VP8_H_
+
+#include "../webp/decode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Lower-level API
+//
+// These functions provide fine-grained control of the decoding process.
+// The call flow should resemble:
+//
+//   VP8Io io;
+//   VP8InitIo(&io);
+//   io.data = data;
+//   io.data_size = size;
+//   /* customize io's functions (setup()/put()/teardown()) if needed. */
+//
+//   VP8Decoder* dec = VP8New();
+//   bool ok = VP8Decode(dec);
+//   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
+//   VP8Delete(dec);
+//   return ok;
+
+// Input / Output
+typedef struct VP8Io VP8Io;
+typedef int (*VP8IoPutHook)(const VP8Io* io);
+typedef int (*VP8IoSetupHook)(VP8Io* io);
+typedef void (*VP8IoTeardownHook)(const VP8Io* io);
+
+struct VP8Io {
+  // set by VP8GetHeaders()
+  int width, height;         // picture dimensions, in pixels (invariable).
+                             // These are the original, uncropped dimensions.
+                             // The actual area passed to put() is stored
+                             // in mb_w / mb_h fields.
+
+  // set before calling put()
+  int mb_y;                  // position of the current rows (in pixels)
+  int mb_w;                  // number of columns in the sample
+  int mb_h;                  // number of rows in the sample
+  const uint8_t* y, *u, *v;  // rows to copy (in yuv420 format)
+  int y_stride;              // row stride for luma
+  int uv_stride;             // row stride for chroma
+
+  void* opaque;              // user data
+
+  // called when fresh samples are available. Currently, samples are in
+  // YUV420 format, and can be up to width x 24 in size (depending on the
+  // in-loop filtering level, e.g.). Should return false in case of error
+  // or abort request. The actual size of the area to update is mb_w x mb_h
+  // in size, taking cropping into account.
+  VP8IoPutHook put;
+
+  // called just before starting to decode the blocks.
+  // Must return false in case of setup error, true otherwise. If false is
+  // returned, teardown() will NOT be called. But if the setup succeeded
+  // and true is returned, then teardown() will always be called afterward.
+  VP8IoSetupHook setup;
+
+  // Called just after block decoding is finished (or when an error occurred
+  // during put()). Is NOT called if setup() failed.
+  VP8IoTeardownHook teardown;
+
+  // this is a recommendation for the user-side yuv->rgb converter. This flag
+  // is set when calling setup() hook and can be overwritten by it. It then
+  // can be taken into consideration during the put() method.
+  int fancy_upsampling;
+
+  // Input buffer.
+  size_t data_size;
+  const uint8_t* data;
+
+  // If true, in-loop filtering will not be performed even if present in the
+  // bitstream. Switching off filtering may speed up decoding at the expense
+  // of more visible blocking. Note that output will also be non-compliant
+  // with the VP8 specifications.
+  int bypass_filtering;
+
+  // Cropping parameters.
+  int use_cropping;
+  int crop_left, crop_right, crop_top, crop_bottom;
+
+  // Scaling parameters.
+  int use_scaling;
+  int scaled_width, scaled_height;
+
+  // If non NULL, pointer to the alpha data (if present) corresponding to the
+  // start of the current row (That is: it is pre-offset by mb_y and takes
+  // cropping into account).
+  const uint8_t* a;
+};
+
+// Internal, version-checked, entry point
+int VP8InitIoInternal(VP8Io* const, int);
+
+// Set the custom IO function pointers and user-data. The setter for IO hooks
+// should be called before initiating incremental decoding. Returns true if
+// WebPIDecoder object is successfully modified, false otherwise.
+int WebPISetIOHooks(WebPIDecoder* const idec,
+                    VP8IoPutHook put,
+                    VP8IoSetupHook setup,
+                    VP8IoTeardownHook teardown,
+                    void* user_data);
+
+// Main decoding object. This is an opaque structure.
+typedef struct VP8Decoder VP8Decoder;
+
+// Create a new decoder object.
+VP8Decoder* VP8New(void);
+
+// Must be called to make sure 'io' is initialized properly.
+// Returns false in case of version mismatch. Upon such failure, no other
+// decoding function should be called (VP8Decode, VP8GetHeaders, ...)
+static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
+  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
+}
+
+// Decode the VP8 frame header. Returns true if ok.
+// Note: 'io->data' must be pointing to the start of the VP8 frame header.
+int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
+
+// Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
+// Returns false in case of error.
+int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
+
+// Return current status of the decoder:
+VP8StatusCode VP8Status(VP8Decoder* const dec);
+
+// return readable string corresponding to the last status.
+const char* VP8StatusMessage(VP8Decoder* const dec);
+
+// Resets the decoder in its initial state, reclaiming memory.
+// Not a mandatory call between calls to VP8Decode().
+void VP8Clear(VP8Decoder* const dec);
+
+// Destroy the decoder object.
+void VP8Delete(VP8Decoder* const dec);
+
+//------------------------------------------------------------------------------
+// Miscellaneous VP8/VP8L bitstream probing functions.
+
+// Returns true if the next 3 bytes in data contain the VP8 signature.
+WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
+
+// Validates the VP8 data-header and retrieves basic header information viz
+// width and height. Returns 0 in case of formatting error. *width/*height
+// can be passed NULL.
+WEBP_EXTERN(int) VP8GetInfo(
+    const uint8_t* data,
+    size_t data_size,    // data available so far
+    size_t chunk_size,   // total data size expected in the chunk
+    int* const width, int* const height);
+
+// Returns true if the next byte(s) in data is a VP8L signature.
+WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
+
+// Validates the VP8L data-header and retrieves basic header information viz
+// width, height and alpha. Returns 0 in case of formatting error.
+// width/height/has_alpha can be passed NULL.
+WEBP_EXTERN(int) VP8LGetInfo(
+    const uint8_t* data, size_t data_size,  // data available so far
+    int* const width, int* const height, int* const has_alpha);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_WEBP_DECODE_VP8_H_ */
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
--- a/src/dec/io.c
+++ b/src/dec/io.c
@ -0,0 +1,648 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// functions for sample output.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "../dec/vp8i.h"
+#include "./webpi.h"
+#include "../dsp/dsp.h"
+#include "../dsp/yuv.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// Main YUV<->RGB conversion functions
+
+static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* output = p->output;
+  const WebPYUVABuffer* const buf = &output->u.YUVA;
+  uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride;
+  uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride;
+  uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride;
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  const int uv_w = (mb_w + 1) / 2;
+  const int uv_h = (mb_h + 1) / 2;
+  int j;
+  for (j = 0; j < mb_h; ++j) {
+    memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
+  }
+  for (j = 0; j < uv_h; ++j) {
+    memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
+    memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
+  }
+  return io->mb_h;
+}
+
+// Point-sampling U/V sampler.
+static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* const output = p->output;
+  WebPRGBABuffer* const buf = &output->u.RGBA;
+  uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
+  WebPSamplerProcessPlane(io->y, io->y_stride,
+                          io->u, io->v, io->uv_stride,
+                          dst, buf->stride, io->mb_w, io->mb_h,
+                          WebPSamplers[output->colorspace]);
+  return io->mb_h;
+}
+
+//------------------------------------------------------------------------------
+// YUV444 -> RGB conversion
+
+#if 0   // TODO(skal): this is for future rescaling.
+static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* output = p->output;
+  const WebPRGBABuffer* const buf = &output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  const uint8_t* y_src = io->y;
+  const uint8_t* u_src = io->u;
+  const uint8_t* v_src = io->v;
+  const WebPYUV444Converter convert = WebPYUV444Converters[output->colorspace];
+  const int mb_w = io->mb_w;
+  const int last = io->mb_h;
+  int j;
+  for (j = 0; j < last; ++j) {
+    convert(y_src, u_src, v_src, dst, mb_w);
+    y_src += io->y_stride;
+    u_src += io->uv_stride;
+    v_src += io->uv_stride;
+    dst += buf->stride;
+  }
+  return io->mb_h;
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Fancy upsampling
+
+#ifdef FANCY_UPSAMPLING
+static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
+  int num_lines_out = io->mb_h;   // a priori guess
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
+  const uint8_t* cur_y = io->y;
+  const uint8_t* cur_u = io->u;
+  const uint8_t* cur_v = io->v;
+  const uint8_t* top_u = p->tmp_u;
+  const uint8_t* top_v = p->tmp_v;
+  int y = io->mb_y;
+  const int y_end = io->mb_y + io->mb_h;
+  const int mb_w = io->mb_w;
+  const int uv_w = (mb_w + 1) / 2;
+
+  if (y == 0) {
+    // First line is special cased. We mirror the u/v samples at boundary.
+    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w);
+  } else {
+    // We can finish the left-over line from previous call.
+    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+    ++num_lines_out;
+  }
+  // Loop over each output pairs of row.
+  for (; y + 2 < y_end; y += 2) {
+    top_u = cur_u;
+    top_v = cur_v;
+    cur_u += io->uv_stride;
+    cur_v += io->uv_stride;
+    dst += 2 * buf->stride;
+    cur_y += 2 * io->y_stride;
+    upsample(cur_y - io->y_stride, cur_y,
+             top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+  }
+  // move to last row
+  cur_y += io->y_stride;
+  if (io->crop_top + y_end < io->crop_bottom) {
+    // Save the unfinished samples for next call (as we're not done yet).
+    memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y));
+    memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u));
+    memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v));
+    // The fancy upsampler leaves a row unfinished behind
+    // (except for the very last row)
+    num_lines_out--;
+  } else {
+    // Process the very last row of even-sized picture
+    if (!(y_end & 1)) {
+      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
+               dst + buf->stride, NULL, mb_w);
+    }
+  }
+  return num_lines_out;
+}
+
+#endif    /* FANCY_UPSAMPLING */
+
+//------------------------------------------------------------------------------
+
+static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+  int j;
+
+  if (alpha != NULL) {
+    for (j = 0; j < mb_h; ++j) {
+      memcpy(dst, alpha, mb_w * sizeof(*dst));
+      alpha += io->width;
+      dst += buf->a_stride;
+    }
+  } else if (buf->a != NULL) {
+    // the user requested alpha, but there is none, set it to opaque.
+    for (j = 0; j < mb_h; ++j) {
+      memset(dst, 0xff, mb_w * sizeof(*dst));
+      dst += buf->a_stride;
+    }
+  }
+  return 0;
+}
+
+static int GetAlphaSourceRow(const VP8Io* const io,
+                             const uint8_t** alpha, int* const num_rows) {
+  int start_y = io->mb_y;
+  *num_rows = io->mb_h;
+
+  // Compensate for the 1-line delay of the fancy upscaler.
+  // This is similar to EmitFancyRGB().
+  if (io->fancy_upsampling) {
+    if (start_y == 0) {
+      // We don't process the last row yet. It'll be done during the next call.
+      --*num_rows;
+    } else {
+      --start_y;
+      // Fortunately, *alpha data is persistent, so we can go back
+      // one row and finish alpha blending, now that the fancy upscaler
+      // completed the YUV->RGB interpolation.
+      *alpha -= io->width;
+    }
+    if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
+      // If it's the very last call, we process all the remaining rows!
+      *num_rows = io->crop_bottom - io->crop_top - start_y;
+    }
+  }
+  return start_y;
+}
+
+static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
+    const int alpha_first =
+        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+    uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
+    uint32_t alpha_mask = 0xff;
+    int i, j;
+
+    for (j = 0; j < num_rows; ++j) {
+      for (i = 0; i < mb_w; ++i) {
+        const uint32_t alpha_value = alpha[i];
+        dst[4 * i] = alpha_value;
+        alpha_mask &= alpha_value;
+      }
+      alpha += io->width;
+      dst += buf->stride;
+    }
+    // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
+    if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                             mb_w, num_rows, buf->stride);
+    }
+  }
+  return 0;
+}
+
+static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+#ifdef WEBP_SWAP_16BIT_CSP
+    uint8_t* alpha_dst = base_rgba;
+#else
+    uint8_t* alpha_dst = base_rgba + 1;
+#endif
+    uint32_t alpha_mask = 0x0f;
+    int i, j;
+
+    for (j = 0; j < num_rows; ++j) {
+      for (i = 0; i < mb_w; ++i) {
+        // Fill in the alpha value (converted to 4 bits).
+        const uint32_t alpha_value = alpha[i] >> 4;
+        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+        alpha_mask &= alpha_value;
+      }
+      alpha += io->width;
+      alpha_dst += buf->stride;
+    }
+    if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
+    }
+  }
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// YUV rescaling (no final RGB conversion needed)
+
+static int Rescale(const uint8_t* src, int src_stride,
+                   int new_lines, WebPRescaler* const wrk) {
+  int num_lines_out = 0;
+  while (new_lines > 0) {    // import new contributions of source rows.
+    const int lines_in = WebPRescalerImport(wrk, new_lines, src, src_stride);
+    src += lines_in * src_stride;
+    new_lines -= lines_in;
+    num_lines_out += WebPRescalerExport(wrk);    // emit output row(s)
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_h = io->mb_h;
+  const int uv_mb_h = (mb_h + 1) >> 1;
+  WebPRescaler* const scaler = &p->scaler_y;
+  int num_lines_out = 0;
+  if (WebPIsAlphaMode(p->output->colorspace) && io->a != NULL) {
+    // Before rescaling, we premultiply the luma directly into the io->y
+    // internal buffer. This is OK since these samples are not used for
+    // intra-prediction (the top samples are saved in cache_y_/u_/v_).
+    // But we need to cast the const away, though.
+    WebPMultRows((uint8_t*)io->y, io->y_stride,
+                 io->a, io->width, io->mb_w, mb_h, 0);
+  }
+  num_lines_out = Rescale(io->y, io->y_stride, mb_h, scaler);
+  Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u);
+  Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v);
+  return num_lines_out;
+}
+
+static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  if (io->a != NULL) {
+    const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+    uint8_t* dst_y = buf->y + p->last_y * buf->y_stride;
+    const uint8_t* src_a = buf->a + p->last_y * buf->a_stride;
+    const int num_lines_out = Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
+    if (num_lines_out > 0) {   // unmultiply the Y
+      WebPMultRows(dst_y, buf->y_stride, src_a, buf->a_stride,
+                   p->scaler_a.dst_width, num_lines_out, 1);
+    }
+  }
+  return 0;
+}
+
+static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
+  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+  const int out_width  = io->scaled_width;
+  const int out_height = io->scaled_height;
+  const int uv_out_width  = (out_width + 1) >> 1;
+  const int uv_out_height = (out_height + 1) >> 1;
+  const int uv_in_width  = (io->mb_w + 1) >> 1;
+  const int uv_in_height = (io->mb_h + 1) >> 1;
+  const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
+  const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
+  size_t tmp_size;
+  int32_t* work;
+
+  tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
+  if (has_alpha) {
+    tmp_size += work_size * sizeof(*work);
+  }
+  p->memory = WebPSafeCalloc(1ULL, tmp_size);
+  if (p->memory == NULL) {
+    return 0;   // memory error
+  }
+  work = (int32_t*)p->memory;
+  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+                   buf->y, out_width, out_height, buf->y_stride, 1,
+                   io->mb_w, out_width, io->mb_h, out_height,
+                   work);
+  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
+                   uv_in_width, uv_out_width,
+                   uv_in_height, uv_out_height,
+                   work + work_size);
+  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
+                   uv_in_width, uv_out_width,
+                   uv_in_height, uv_out_height,
+                   work + work_size + uv_work_size);
+  p->emit = EmitRescaledYUV;
+
+  if (has_alpha) {
+    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+                     buf->a, out_width, out_height, buf->a_stride, 1,
+                     io->mb_w, out_width, io->mb_h, out_height,
+                     work + work_size + 2 * uv_work_size);
+    p->emit_alpha = EmitRescaledAlphaYUV;
+    WebPInitAlphaProcessing();
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// RGBA rescaling
+
+static int ExportRGB(WebPDecParams* const p, int y_pos) {
+  const WebPYUV444Converter convert =
+      WebPYUV444Converters[p->output->colorspace];
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  int num_lines_out = 0;
+  // For RGB rescaling, because of the YUV420, current scan position
+  // U/V can be +1/-1 line from the Y one.  Hence the double test.
+  while (WebPRescalerHasPendingOutput(&p->scaler_y) &&
+         WebPRescalerHasPendingOutput(&p->scaler_u)) {
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
+    WebPRescalerExportRow(&p->scaler_y, 0);
+    WebPRescalerExportRow(&p->scaler_u, 0);
+    WebPRescalerExportRow(&p->scaler_v, 0);
+    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
+            dst, p->scaler_y.dst_width);
+    dst += buf->stride;
+    ++num_lines_out;
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_h = io->mb_h;
+  const int uv_mb_h = (mb_h + 1) >> 1;
+  int j = 0, uv_j = 0;
+  int num_lines_out = 0;
+  while (j < mb_h) {
+    const int y_lines_in =
+        WebPRescalerImport(&p->scaler_y, mb_h - j,
+                           io->y + j * io->y_stride, io->y_stride);
+    const int u_lines_in =
+        WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
+                           io->u + uv_j * io->uv_stride, io->uv_stride);
+    const int v_lines_in =
+        WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
+                           io->v + uv_j * io->uv_stride, io->uv_stride);
+    (void)v_lines_in;   // remove a gcc warning
+    assert(u_lines_in == v_lines_in);
+    j += y_lines_in;
+    uv_j += u_lines_in;
+    num_lines_out += ExportRGB(p, num_lines_out);
+  }
+  return num_lines_out;
+}
+
+static int ExportAlpha(WebPDecParams* const p, int y_pos) {
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int alpha_first =
+      (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+  uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
+  int num_lines_out = 0;
+  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0xff;
+  const int width = p->scaler_a.dst_width;
+
+  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
+    int i;
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a, 0);
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = p->scaler_a.dst[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    dst += buf->stride;
+    ++num_lines_out;
+  }
+  if (is_premult_alpha && alpha_mask != 0xff) {
+    WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                           width, num_lines_out, buf->stride);
+  }
+  return num_lines_out;
+}
+
+static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+#ifdef WEBP_SWAP_16BIT_CSP
+  uint8_t* alpha_dst = base_rgba;
+#else
+  uint8_t* alpha_dst = base_rgba + 1;
+#endif
+  int num_lines_out = 0;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int width = p->scaler_a.dst_width;
+  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0x0f;
+
+  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
+    int i;
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a, 0);
+    for (i = 0; i < width; ++i) {
+      // Fill in the alpha value (converted to 4 bits).
+      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
+      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    alpha_dst += buf->stride;
+    ++num_lines_out;
+  }
+  if (is_premult_alpha && alpha_mask != 0x0f) {
+    WebPApplyAlphaMultiply4444(base_rgba, width, num_lines_out, buf->stride);
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  if (io->a != NULL) {
+    WebPRescaler* const scaler = &p->scaler_a;
+    int j = 0;
+    int pos = 0;
+    while (j < io->mb_h) {
+      j += WebPRescalerImport(scaler, io->mb_h - j,
+                              io->a + j * io->width, io->width);
+      pos += p->emit_alpha_row(p, pos);
+    }
+  }
+  return 0;
+}
+
+static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
+  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
+  const int out_width  = io->scaled_width;
+  const int out_height = io->scaled_height;
+  const int uv_in_width  = (io->mb_w + 1) >> 1;
+  const int uv_in_height = (io->mb_h + 1) >> 1;
+  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
+  int32_t* work;  // rescalers work area
+  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
+  size_t tmp_size1, tmp_size2, total_size;
+
+  tmp_size1 = 3 * work_size;
+  tmp_size2 = 3 * out_width;
+  if (has_alpha) {
+    tmp_size1 += work_size;
+    tmp_size2 += out_width;
+  }
+  total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
+  p->memory = WebPSafeCalloc(1ULL, total_size);
+  if (p->memory == NULL) {
+    return 0;   // memory error
+  }
+  work = (int32_t*)p->memory;
+  tmp = (uint8_t*)(work + tmp_size1);
+  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+                   tmp + 0 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, out_width, io->mb_h, out_height,
+                   work + 0 * work_size);
+  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+                   tmp + 1 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 1 * work_size);
+  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+                   tmp + 2 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 2 * work_size);
+  p->emit = EmitRescaledRGB;
+
+  if (has_alpha) {
+    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+                     tmp + 3 * out_width, out_width, out_height, 0, 1,
+                     io->mb_w, out_width, io->mb_h, out_height,
+                     work + 3 * work_size);
+    p->emit_alpha = EmitRescaledAlphaRGB;
+    if (p->output->colorspace == MODE_RGBA_4444 ||
+        p->output->colorspace == MODE_rgbA_4444) {
+      p->emit_alpha_row = ExportAlphaRGBA4444;
+    } else {
+      p->emit_alpha_row = ExportAlpha;
+    }
+    WebPInitAlphaProcessing();
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Default custom functions
+
+static int CustomSetup(VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int is_rgb = WebPIsRGBMode(colorspace);
+  const int is_alpha = WebPIsAlphaMode(colorspace);
+
+  p->memory = NULL;
+  p->emit = NULL;
+  p->emit_alpha = NULL;
+  p->emit_alpha_row = NULL;
+  if (!WebPIoInitFromOptions(p->options, io, is_alpha ? MODE_YUV : MODE_YUVA)) {
+    return 0;
+  }
+  if (is_alpha && WebPIsPremultipliedMode(colorspace)) {
+    WebPInitUpsamplers();
+  }
+  if (io->use_scaling) {
+    const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
+    if (!ok) {
+      return 0;    // memory error
+    }
+  } else {
+    if (is_rgb) {
+      p->emit = EmitSampledRGB;   // default
+      if (io->fancy_upsampling) {
+#ifdef FANCY_UPSAMPLING
+        const int uv_width = (io->mb_w + 1) >> 1;
+        p->memory = WebPSafeMalloc(1ULL, (size_t)(io->mb_w + 2 * uv_width));
+        if (p->memory == NULL) {
+          return 0;   // memory error.
+        }
+        p->tmp_y = (uint8_t*)p->memory;
+        p->tmp_u = p->tmp_y + io->mb_w;
+        p->tmp_v = p->tmp_u + uv_width;
+        p->emit = EmitFancyRGB;
+        WebPInitUpsamplers();
+#endif
+      } else {
+        WebPInitSamplers();
+      }
+    } else {
+      p->emit = EmitYUV;
+    }
+    if (is_alpha) {  // need transparency output
+      p->emit_alpha =
+          (colorspace == MODE_RGBA_4444 || colorspace == MODE_rgbA_4444) ?
+              EmitAlphaRGBA4444
+          : is_rgb ? EmitAlphaRGB
+          : EmitAlphaYUV;
+      if (is_rgb) {
+        WebPInitAlphaProcessing();
+      }
+    }
+  }
+
+  if (is_rgb) {
+    VP8YUVInit();
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static int CustomPut(const VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  int num_lines_out;
+  assert(!(io->mb_y & 1));
+
+  if (mb_w <= 0 || mb_h <= 0) {
+    return 0;
+  }
+  num_lines_out = p->emit(io, p);
+  if (p->emit_alpha != NULL) {
+    p->emit_alpha(io, p);
+  }
+  p->last_y += num_lines_out;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static void CustomTeardown(const VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  WebPSafeFree(p->memory);
+  p->memory = NULL;
+}
+
+//------------------------------------------------------------------------------
+// Main entry point
+
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
+  io->put      = CustomPut;
+  io->setup    = CustomSetup;
+  io->teardown = CustomTeardown;
+  io->opaque   = params;
+}
+
+//------------------------------------------------------------------------------
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@ -1,21 +1,19 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Quantizer initialization
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "vp8i.h"
+#include "./vp8i.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-static inline int clip(int v, int M) {
+static WEBP_INLINE int clip(int v, int M) {
  return v < 0 ? 0 : v > M ? M : v;
 }

@ -58,7 +56,7 @@ static const uint16_t kAcTable[128] = {
  249, 254, 259, 264, 269, 274, 279, 284
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 9.6

 void VP8ParseQuant(VP8Decoder* const dec) {
@ -94,18 +92,19 @@ void VP8ParseQuant(VP8Decoder* const dec) {
      m->y1_mat_[1] = kAcTable[clip(q + 0,       127)];

      m->y2_mat_[0] = kDcTable[clip(q + dqy2_dc, 127)] * 2;
-      // TODO(skal): make it another table?
-      m->y2_mat_[1] = kAcTable[clip(q + dqy2_ac, 127)] * 155 / 100;
+      // For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+      // The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+      // word size.
+      m->y2_mat_[1] = (kAcTable[clip(q + dqy2_ac, 127)] * 101581) >> 16;
      if (m->y2_mat_[1] < 8) m->y2_mat_[1] = 8;

      m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
      m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
+
+      m->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
    }
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@ -1,22 +1,21 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Coding trees and probas
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "vp8i.h"
+#include "./vp8i.h"
+#include "../utils/bit_reader_inl.h"

 #define USE_GENERIC_TREE

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #ifdef USE_GENERIC_TREE
 static const int8_t kYModesIntra4[18] = {
  -B_DC_PRED, 1,
@ -31,61 +30,12 @@ static const int8_t kYModesIntra4[18] = {
 };
 #endif

-#ifndef ONLY_KEYFRAME_CODE
-
-// inter prediction modes
-enum {
-  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
-  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
-
-static const int8_t kYModesInter[8] = {
-  -DC_PRED, 1,
-    2, 3,
-      -V_PRED, -H_PRED,
-      -TM_PRED, -B_PRED
-};
-
-static const int8_t kMBSplit[6] = {
-  -3, 1,
-    -2, 2,
-      -0, -1
-};
-
-static const int8_t kMVRef[8] = {
-  -ZEROMV, 1,
-    -NEARESTMV, 2,
-      -NEARMV, 3,
-        -NEWMV, -SPLITMV
-};
-
-static const int8_t kMVRef4[6] = {
-  -LEFT4, 1
-    -ABOVE4, 2
-      -ZERO4, -NEW4
-};
-#endif
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Default probabilities

-// Inter
-#ifndef ONLY_KEYFRAME_CODE
-static const uint8_t kYModeProbaInter0[4] = { 112, 86, 140, 37 };
-static const uint8_t kUVModeProbaInter0[3] = { 162, 101, 204 };
-static const uint8_t kMVProba0[2][NUM_MV_PROBAS] = {
-  { 162, 128, 225, 146, 172, 147, 214,  39,
-    156, 128, 129, 132,  75, 145, 178, 206,
-    239, 254, 254 },
-  { 164, 128, 204, 170, 119, 235, 140, 230,
-    228, 128, 130, 130,  74, 148, 180, 203,
-    236, 254, 254 }
-};
-#endif
-
 // Paragraph 13.5
 static const uint8_t
  CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
-  // genereated using vp8_default_coef_probs() in entropy.c:129
  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@ -326,28 +276,38 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {

 void VP8ResetProba(VP8Proba* const proba) {
  memset(proba->segments_, 255u, sizeof(proba->segments_));
-  memcpy(proba->coeffs_, CoeffsProba0, sizeof(CoeffsProba0));
-#ifndef ONLY_KEYFRAME_CODE
-  memcpy(proba->mv_, kMVProba0, sizeof(kMVProba0));
-  memcpy(proba->ymode_, kYModeProbaInter0, sizeof(kYModeProbaInter0));
-  memcpy(proba->uvmode_, kUVModeProbaInter0, sizeof(kUVModeProbaInter0));
-#endif
+  // proba->bands_[][] is initialized later
 }

-void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
-  uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
+static void ParseIntraMode(VP8BitReader* const br,
+                           VP8Decoder* const dec, int mb_x) {
+  uint8_t* const top = dec->intra_t_ + 4 * mb_x;
  uint8_t* const left = dec->intra_l_;
-  // Hardcoded 16x16 intra-mode decision tree.
-  dec->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
-  if (!dec->is_i4x4_) {
+  VP8MBData* const block = dec->mb_data_ + mb_x;
+
+  // Note: we don't save segment map (yet), as we don't expect
+  // to decode more than 1 keyframe.
+  if (dec->segment_hdr_.update_map_) {
+    // Hardcoded tree parsing
+    block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0])
+                    ? VP8GetBit(br, dec->proba_.segments_[1])
+                    : 2 + VP8GetBit(br, dec->proba_.segments_[2]);
+  } else {
+    block->segment_ = 0;  // default for intra
+  }
+  if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_);
+
+  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
+  if (!block->is_i4x4_) {
+    // Hardcoded 16x16 intra-mode decision tree.
    const int ymode =
        VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
                           : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
-    dec->imodes_[0] = ymode;
-    memset(top, ymode, 4 * sizeof(top[0]));
-    memset(left, ymode, 4 * sizeof(left[0]));
+    block->imodes_[0] = ymode;
+    memset(top, ymode, 4 * sizeof(*top));
+    memset(left, ymode, 4 * sizeof(*left));
  } else {
-    uint8_t* modes = dec->imodes_;
+    uint8_t* modes = block->imodes_;
    int y;
    for (y = 0; y < 4; ++y) {
      int ymode = left[y];
@ -356,10 +316,10 @@ void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
        const uint8_t* const prob = kBModesProba[top[x]][ymode];
 #ifdef USE_GENERIC_TREE
        // Generic tree-parsing
-        int i = 0;
-        do {
+        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
+        while (i > 0) {
          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
-        } while (i > 0);
+        }
        ymode = -i;
 #else
        // Hardcoded tree parsing
@ -374,18 +334,27 @@ void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
 #endif    // USE_GENERIC_TREE
        top[x] = ymode;
-        *modes++ = ymode;
      }
+      memcpy(modes, top, 4 * sizeof(*top));
+      modes += 4;
      left[y] = ymode;
    }
  }
  // Hardcoded UVMode decision tree
-  dec->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
-               : !VP8GetBit(br, 114) ? V_PRED
-               : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+  block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
+                 : !VP8GetBit(br, 114) ? V_PRED
+                 : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
 }

-//-----------------------------------------------------------------------------
+int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) {
+  int mb_x;
+  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+    ParseIntraMode(br, dec, mb_x);
+  }
+  return !dec->br_.eof_;
+}
+
+//------------------------------------------------------------------------------
 // Paragraph 13

 static const uint8_t
@ -524,17 +493,6 @@ static const uint8_t
  }
 };

-#ifndef ONLY_KEYFRAME_CODE
-static const uint8_t MVUpdateProba[2][NUM_MV_PROBAS] = {
-  { 237, 246, 253, 253, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 250, 250,
-    252, 254, 254 },
-  { 231, 243, 245, 253, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 251, 251,
-    254, 254, 254 }
-};
-#endif
-
 // Paragraph 9.9
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  VP8Proba* const proba = &dec->proba_;
@ -543,9 +501,9 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
    for (b = 0; b < NUM_BANDS; ++b) {
      for (c = 0; c < NUM_CTX; ++c) {
        for (p = 0; p < NUM_PROBAS; ++p) {
-          if (VP8GetBit(br, CoeffsUpdateProba[t][b][c][p])) {
-            proba->coeffs_[t][b][c][p] = VP8GetValue(br, 8);
-          }
+          const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
+                        VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
+          proba->bands_[t][b].probas_[c][p] = v;
        }
      }
    }
@ -554,36 +512,5 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  if (dec->use_skip_proba_) {
    dec->skip_p_ = VP8GetValue(br, 8);
  }
-#ifndef ONLY_KEYFRAME_CODE
-  if (!dec->frm_hdr_.key_frame_) {
-    int i;
-    dec->intra_p_ = VP8GetValue(br, 8);
-    dec->last_p_ = VP8GetValue(br, 8);
-    dec->golden_p_ = VP8GetValue(br, 8);
-    if (VP8Get(br)) {   // update y-mode
-      for (i = 0; i < 4; ++i) {
-        proba->ymode_[i] = VP8GetValue(br, 8);
-      }
-    }
-    if (VP8Get(br)) {   // update uv-mode
-      for (i = 0; i < 3; ++i) {
-        proba->uvmode_[i] = VP8GetValue(br, 8);
-      }
-    }
-    // update MV
-    for (i = 0; i < 2; ++i) {
-      int k;
-      for (k = 0; k < NUM_MV_PROBAS; ++k) {
-        if (VP8GetBit(br, MVUpdateProba[i][k])) {
-          const int v = VP8GetValue(br, 7);
-          proba->mv_[i][k] = v ? v << 1 : 1;
-        }
-      }
-    }
-  }
-#endif
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // main entry for the decoder
@ -10,19 +12,21 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "vp8i.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "./alphai.h"
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "./webpi.h"
+#include "../utils/bit_reader_inl.h"
+#include "../utils/utils.h"

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 int WebPGetDecoderVersion(void) {
  return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8Decoder

 static void SetOk(VP8Decoder* const dec) {
@ -31,19 +35,22 @@ static void SetOk(VP8Decoder* const dec) {
 }

 int VP8InitIoInternal(VP8Io* const io, int version) {
-  if (version != WEBP_DECODER_ABI_VERSION)
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
    return 0;  // mismatch error
-  if (io) {
+  }
+  if (io != NULL) {
    memset(io, 0, sizeof(*io));
  }
  return 1;
 }

 VP8Decoder* VP8New(void) {
-  VP8Decoder* dec = (VP8Decoder*)calloc(1, sizeof(VP8Decoder));
-  if (dec) {
+  VP8Decoder* const dec = (VP8Decoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
+  if (dec != NULL) {
    SetOk(dec);
+    WebPGetWorkerInterface()->Init(&dec->worker_);
    dec->ready_ = 0;
+    dec->num_parts_ = 1;
  }
  return dec;
 }
@ -54,31 +61,86 @@ VP8StatusCode VP8Status(VP8Decoder* const dec) {
 }

 const char* VP8StatusMessage(VP8Decoder* const dec) {
-  if (!dec) return "no object";
+  if (dec == NULL) return "no object";
  if (!dec->error_msg_) return "OK";
  return dec->error_msg_;
 }

 void VP8Delete(VP8Decoder* const dec) {
-  if (dec) {
+  if (dec != NULL) {
    VP8Clear(dec);
-    free(dec);
+    WebPSafeFree(dec);
  }
 }

 int VP8SetError(VP8Decoder* const dec,
-                VP8StatusCode error, const char * const msg) {
-  dec->status_ = error;
-  dec->error_msg_ = msg;
-  dec->ready_ = 0;
+                VP8StatusCode error, const char* const msg) {
+  // TODO This check would be unnecessary if alpha decompression was separated
+  // from VP8ProcessRow/FinishRow. This avoids setting 'dec->status_' to
+  // something other than VP8_STATUS_BITSTREAM_ERROR on alpha decompression
+  // failure.
+  if (dec->status_ == VP8_STATUS_OK) {
+    dec->status_ = error;
+    dec->error_msg_ = msg;
+    dec->ready_ = 0;
+  }
  return 0;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+
+int VP8CheckSignature(const uint8_t* const data, size_t data_size) {
+  return (data_size >= 3 &&
+          data[0] == 0x9d && data[1] == 0x01 && data[2] == 0x2a);
+}
+
+int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
+               int* const width, int* const height) {
+  if (data == NULL || data_size < VP8_FRAME_HEADER_SIZE) {
+    return 0;         // not enough data
+  }
+  // check signature
+  if (!VP8CheckSignature(data + 3, data_size - 3)) {
+    return 0;         // Wrong signature.
+  } else {
+    const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16);
+    const int key_frame = !(bits & 1);
+    const int w = ((data[7] << 8) | data[6]) & 0x3fff;
+    const int h = ((data[9] << 8) | data[8]) & 0x3fff;
+
+    if (!key_frame) {   // Not a keyframe.
+      return 0;
+    }
+
+    if (((bits >> 1) & 7) > 3) {
+      return 0;         // unknown profile
+    }
+    if (!((bits >> 4) & 1)) {
+      return 0;         // first frame is invisible!
+    }
+    if (((bits >> 5)) >= chunk_size) {  // partition_length
+      return 0;         // inconsistent size information.
+    }
+    if (w == 0 || h == 0) {
+      return 0;         // We don't support both width and height to be zero.
+    }
+
+    if (width) {
+      *width = w;
+    }
+    if (height) {
+      *height = h;
+    }
+
+    return 1;
+  }
+}
+
+//------------------------------------------------------------------------------
 // Header parsing

 static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
-  assert(hdr);
+  assert(hdr != NULL);
  hdr->use_segment_ = 0;
  hdr->update_map_ = 0;
  hdr->absolute_delta_ = 1;
@ -89,8 +151,8 @@ static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
 // Paragraph 9.3
 static int ParseSegmentHeader(VP8BitReader* br,
                              VP8SegmentHeader* hdr, VP8Proba* proba) {
-  assert(br);
-  assert(hdr);
+  assert(br != NULL);
+  assert(hdr != NULL);
  hdr->use_segment_ = VP8Get(br);
  if (hdr->use_segment_) {
    hdr->update_map_ = VP8Get(br);
@ -126,7 +188,7 @@ static int ParseSegmentHeader(VP8BitReader* br,
 // is returned, and this is an unrecoverable error.
 // If the partitions were positioned ok, VP8_STATUS_OK is returned.
 static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
-                                     const uint8_t* buf, uint32_t size) {
+                                     const uint8_t* buf, size_t size) {
  VP8BitReader* const br = &dec->br_;
  const uint8_t* sz = buf;
  const uint8_t* buf_end = buf + size;
@ -177,31 +239,13 @@ static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
    }
  }
  dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2;
-  if (dec->filter_type_ > 0) {    // precompute filter levels per segment
-    if (dec->segment_hdr_.use_segment_) {
-      int s;
-      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        int strength = dec->segment_hdr_.filter_strength_[s];
-        if (!dec->segment_hdr_.absolute_delta_) {
-          strength += hdr->level_;
-        }
-        dec->filter_levels_[s] = strength;
-      }
-    } else {
-      dec->filter_levels_[0] = hdr->level_;
-    }
-  }
  return !br->eof_;
 }

-static inline uint32_t get_le32(const uint8_t* const data) {
-  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
-}
-
 // Topmost call
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
-  uint8_t* buf;
-  uint32_t buf_size;
+  const uint8_t* buf;
+  size_t buf_size;
  VP8FrameHeader* frm_hdr;
  VP8PictureHeader* pic_hdr;
  VP8BitReader* br;
@ -215,42 +259,11 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
                       "null VP8Io passed to VP8GetHeaders()");
  }
-
-  buf = (uint8_t *)io->data;
+  buf = io->data;
  buf_size = io->data_size;
-  if (buf == NULL || buf_size <= 4) {
+  if (buf_size < 4) {
    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
-                       "Not enough data to parse frame header");
-  }
-
-  // Skip over valid RIFF headers
-  if (!memcmp(buf, "RIFF", 4)) {
-    uint32_t riff_size;
-    uint32_t chunk_size;
-    if (buf_size < 20 + 4) {
-      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
-                         "RIFF: Truncated header.");
-    }
-    if (memcmp(buf + 8, "WEBP", 4)) {   // wrong image file signature
-      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                         "RIFF: WEBP signature not found.");
-    }
-    riff_size = get_le32(buf + 4);
-    if (riff_size < 12) {
-      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
-                         "RIFF: Truncated header.");
-    }
-    if (memcmp(buf + 12, "VP8 ", 4)) {
-      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                         "RIFF: Invalid compression format.");
-    }
-    chunk_size = get_le32(buf + 16);
-    if (chunk_size > riff_size - 12) {
-      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                         "RIFF: Inconsistent size information.");
-    }
-    buf += 20;
-    buf_size -= 20;
+                       "Truncated header.");
  }

  // Paragraph 9.1
@ -278,7 +291,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                         "cannot parse picture header");
    }
-    if (buf[0] != 0x9d || buf[1] != 0x01 || buf[2] != 0x2a) {
+    if (!VP8CheckSignature(buf, buf_size)) {
      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                         "Bad code word");
    }
@ -291,12 +304,20 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {

    dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
    dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
+    // Setup default output area (can be later modified during io->setup())
    io->width = pic_hdr->width_;
    io->height = pic_hdr->height_;
+    io->use_scaling  = 0;
+    io->use_cropping = 0;
+    io->crop_top  = 0;
+    io->crop_left = 0;
+    io->crop_right  = io->width;
+    io->crop_bottom = io->height;
+    io->mb_w = io->width;   // sanity check
+    io->mb_h = io->height;  // ditto

    VP8ResetProba(&dec->proba_);
    ResetSegmentHeader(&dec->segment_hdr_);
-    dec->segment_ = 0;    // default for intra
  }

  // Check if we have all the partition #0 available, and initialize dec->br_
@ -305,6 +326,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                       "bad partition length");
  }
+
  br = &dec->br_;
  VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
  buf += frm_hdr->partition_length_;
@ -333,38 +355,11 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {

  // Frame buffer marking
  if (!frm_hdr->key_frame_) {
-    // Paragraph 9.7
-#ifndef ONLY_KEYFRAME_CODE
-    dec->buffer_flags_ = VP8Get(br) << 0;   // update golden
-    dec->buffer_flags_ |= VP8Get(br) << 1;  // update alt ref
-    if (!(dec->buffer_flags_ & 1)) {
-      dec->buffer_flags_ |= VP8GetValue(br, 2) << 2;
-    }
-    if (!(dec->buffer_flags_ & 2)) {
-      dec->buffer_flags_ |= VP8GetValue(br, 2) << 4;
-    }
-    dec->buffer_flags_ |= VP8Get(br) << 6;    // sign bias golden
-    dec->buffer_flags_ |= VP8Get(br) << 7;    // sign bias alt ref
-#else
    return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
                       "Not a key frame.");
-#endif
-  } else {
-    dec->buffer_flags_ = 0x003 | 0x100;
  }

-  // Paragraph 9.8
-#ifndef ONLY_KEYFRAME_CODE
-  dec->update_proba_ = VP8Get(br);
-  if (!dec->update_proba_) {    // save for later restore
-    dec->proba_saved_ = dec->proba_;
-  }
-  dec->buffer_flags_ &= 1 << 8;
-  dec->buffer_flags_ |=
-      (frm_hdr->key_frame_ || VP8Get(br)) << 8;    // refresh last frame
-#else
-  VP8Get(br);   // just ignore the value of update_proba_
-#endif
+  VP8Get(br);   // ignore the value of update_proba_

  VP8ParseProba(br, dec);

@ -373,10 +368,10 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Residual decoding (Paragraph 13.2 / 13.3)

-static const uint8_t kBands[16 + 1] = {
+static const int kBands[16 + 1] = {
  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
  0  // extra entry as sentinel
 };
@ -386,246 +381,238 @@ static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
 static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
 static const uint8_t kCat6[] =
  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
-static const uint8_t * const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
 static const uint8_t kZigzag[16] = {
  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };

-typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
-
-// Returns the position of the last non-zero coeff plus one
-// (and 0 if there's no coeff at all)
-static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
-                     int ctx, const uint16_t dq[2], int n, int16_t* out) {
-  const uint8_t* p = prob[kBands[n]][ctx];
-  if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
-    return 0;
-  }
-  while (1) {
-    ++n;
-    if (!VP8GetBit(br, p[1])) {
-      p = prob[kBands[n]][0];
-    } else {  // non zero coeff
-      int v, j;
-      if (!VP8GetBit(br, p[2])) {
-        p = prob[kBands[n]][1];
-        v = 1;
+// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
+  int v;
+  if (!VP8GetBit(br, p[3])) {
+    if (!VP8GetBit(br, p[4])) {
+      v = 2;
+    } else {
+      v = 3 + VP8GetBit(br, p[5]);
+    }
+  } else {
+    if (!VP8GetBit(br, p[6])) {
+      if (!VP8GetBit(br, p[7])) {
+        v = 5 + VP8GetBit(br, 159);
      } else {
-        if (!VP8GetBit(br, p[3])) {
-          if (!VP8GetBit(br, p[4])) {
-            v = 2;
-          } else {
-            v = 3 + VP8GetBit(br, p[5]);
-          }
-        } else {
-          if (!VP8GetBit(br, p[6])) {
-            if (!VP8GetBit(br, p[7])) {
-              v = 5 + VP8GetBit(br, 159);
-            } else {
-              v = 7 + 2 * VP8GetBit(br, 165) + VP8GetBit(br, 145);
-            }
-          } else {
-            const uint8_t* tab;
-            const int bit1 = VP8GetBit(br, p[8]);
-            const int bit0 = VP8GetBit(br, p[9 + bit1]);
-            const int cat = 2 * bit1 + bit0;
-            v = 0;
-            for (tab = kCat3456[cat]; *tab; ++tab) {
-              v += v + VP8GetBit(br, *tab);
-            }
-            v += 3 + (8 << cat);
-          }
-        }
-        p = prob[kBands[n]][2];
+        v = 7 + 2 * VP8GetBit(br, 165);
+        v += VP8GetBit(br, 145);
      }
-      j = kZigzag[n - 1];
-      out[j] = VP8GetSigned(br, v) * dq[j > 0];
-      if (n == 16 || !VP8GetBit(br, p[0])) {   // EOB
-        return n;
+    } else {
+      const uint8_t* tab;
+      const int bit1 = VP8GetBit(br, p[8]);
+      const int bit0 = VP8GetBit(br, p[9 + bit1]);
+      const int cat = 2 * bit1 + bit0;
+      v = 0;
+      for (tab = kCat3456[cat]; *tab; ++tab) {
+        v += v + VP8GetBit(br, *tab);
      }
-    }
-    if (n == 16) {
-      return 16;
+      v += 3 + (8 << cat);
    }
  }
+  return v;
 }

-// Alias-safe way of converting 4bytes to 32bits.
-typedef union {
-  uint8_t  i8[4];
-  uint32_t i32;
-} PackedNz;
+// Returns the position of the last non-zero coeff plus one
+static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob,
+                     int ctx, const quant_t dq, int n, int16_t* out) {
+  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
+  const uint8_t* p = prob[n].probas_[ctx];
+  for (; n < 16; ++n) {
+    if (!VP8GetBit(br, p[0])) {
+      return n;  // previous coeff was last non-zero coeff
+    }
+    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
+      p = prob[kBands[++n]].probas_[0];
+      if (n == 16) return 16;
+    }
+    {        // non zero coeff
+      const VP8ProbaArray* const p_ctx = &prob[kBands[n + 1]].probas_[0];
+      int v;
+      if (!VP8GetBit(br, p[2])) {
+        v = 1;
+        p = p_ctx[1];
+      } else {
+        v = GetLargeValue(br, p);
+        p = p_ctx[2];
+      }
+      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+    }
+  }
+  return 16;
+}

-// Table to unpack four bits into four bytes
-static const PackedNz kUnpackTab[16] = {
-  {{0, 0, 0, 0}},  {{1, 0, 0, 0}},  {{0, 1, 0, 0}},  {{1, 1, 0, 0}},
-  {{0, 0, 1, 0}},  {{1, 0, 1, 0}},  {{0, 1, 1, 0}},  {{1, 1, 1, 0}},
-  {{0, 0, 0, 1}},  {{1, 0, 0, 1}},  {{0, 1, 0, 1}},  {{1, 1, 0, 1}},
-  {{0, 0, 1, 1}},  {{1, 0, 1, 1}},  {{0, 1, 1, 1}},  {{1, 1, 1, 1}} };
+static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
+  nz_coeffs <<= 2;
+  nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;
+  return nz_coeffs;
+}

-// Macro to pack four LSB of four bytes into four bits.
-#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
-    defined(__BIG_ENDIAN__)
-#define PACK_CST 0x08040201U
-#else
-#define PACK_CST 0x01020408U
-#endif
-#define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S))
-
-static void ParseResiduals(VP8Decoder* const dec,
-                           VP8MB* const mb, VP8BitReader* const token_br) {
-  int out_t_nz, out_l_nz, first;
-  ProbaArray ac_prob;
-  const VP8QuantMatrix* q = &dec->dqm_[dec->segment_];
-  int16_t* dst = dec->coeffs_;
+static int ParseResiduals(VP8Decoder* const dec,
+                          VP8MB* const mb, VP8BitReader* const token_br) {
+  VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
+  const VP8BandProbas* ac_proba;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+  const VP8QuantMatrix* const q = &dec->dqm_[block->segment_];
+  int16_t* dst = block->coeffs_;
  VP8MB* const left_mb = dec->mb_info_ - 1;
-  PackedNz nz_ac, nz_dc;
-  PackedNz tnz, lnz;
-  uint32_t non_zero_ac = 0;
-  uint32_t non_zero_dc = 0;
+  uint8_t tnz, lnz;
+  uint32_t non_zero_y = 0;
+  uint32_t non_zero_uv = 0;
  int x, y, ch;
+  uint32_t out_t_nz, out_l_nz;
+  int first;

  memset(dst, 0, 384 * sizeof(*dst));
-  if (!dec->is_i4x4_) {    // parse DC
+  if (!block->is_i4x4_) {    // parse DC
    int16_t dc[16] = { 0 };
-    const int ctx = mb->dc_nz_ + left_mb->dc_nz_;
-    mb->dc_nz_ = left_mb->dc_nz_ =
-        (GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1],
-                   ctx, q->y2_mat_, 0, dc) > 0);
+    const int ctx = mb->nz_dc_ + left_mb->nz_dc_;
+    const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc);
+    mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0);
+    if (nz > 1) {   // more than just the DC -> perform the full transform
+      VP8TransformWHT(dc, dst);
+    } else {        // only DC is non-zero -> inlined simplified transform
+      int i;
+      const int dc0 = (dc[0] + 3) >> 3;
+      for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0;
+    }
    first = 1;
-    ac_prob = (ProbaArray)dec->proba_.coeffs_[0];
-    VP8TransformWHT(dc, dst);
+    ac_proba = bands[0];
  } else {
    first = 0;
-    ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
+    ac_proba = bands[3];
  }

-  tnz = kUnpackTab[mb->nz_ & 0xf];
-  lnz = kUnpackTab[left_mb->nz_ & 0xf];
+  tnz = mb->nz_ & 0x0f;
+  lnz = left_mb->nz_ & 0x0f;
  for (y = 0; y < 4; ++y) {
-    int l = lnz.i8[y];
+    int l = lnz & 1;
+    uint32_t nz_coeffs = 0;
    for (x = 0; x < 4; ++x) {
-      const int ctx = l + tnz.i8[x];
-      const int nz = GetCoeffs(token_br, ac_prob, ctx,
-                               q->y1_mat_, first, dst);
-      tnz.i8[x] = l = (nz > 0);
-      nz_dc.i8[x] = (dst[0] != 0);
-      nz_ac.i8[x] = (nz > 1);
+      const int ctx = l + (tnz & 1);
+      const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
+      l = (nz > first);
+      tnz = (tnz >> 1) | (l << 7);
+      nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
      dst += 16;
    }
-    lnz.i8[y] = l;
-    non_zero_dc |= PACK(nz_dc, 24 - y * 4);
-    non_zero_ac |= PACK(nz_ac, 24 - y * 4);
+    tnz >>= 4;
+    lnz = (lnz >> 1) | (l << 7);
+    non_zero_y = (non_zero_y << 8) | nz_coeffs;
  }
-  out_t_nz = PACK(tnz, 24);
-  out_l_nz = PACK(lnz, 24);
+  out_t_nz = tnz;
+  out_l_nz = lnz >> 4;

-  tnz = kUnpackTab[mb->nz_ >> 4];
-  lnz = kUnpackTab[left_mb->nz_ >> 4];
  for (ch = 0; ch < 4; ch += 2) {
+    uint32_t nz_coeffs = 0;
+    tnz = mb->nz_ >> (4 + ch);
+    lnz = left_mb->nz_ >> (4 + ch);
    for (y = 0; y < 2; ++y) {
-      int l = lnz.i8[ch + y];
+      int l = lnz & 1;
      for (x = 0; x < 2; ++x) {
-        const int ctx = l + tnz.i8[ch + x];
-        const int nz =
-            GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
-                      ctx, q->uv_mat_, 0, dst);
-        tnz.i8[ch + x] = l = (nz > 0);
-        nz_dc.i8[y * 2 + x] = (dst[0] != 0);
-        nz_ac.i8[y * 2 + x] = (nz > 1);
+        const int ctx = l + (tnz & 1);
+        const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
+        l = (nz > 0);
+        tnz = (tnz >> 1) | (l << 3);
+        nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
        dst += 16;
      }
-      lnz.i8[ch + y] = l;
+      tnz >>= 2;
+      lnz = (lnz >> 1) | (l << 5);
    }
-    non_zero_dc |= PACK(nz_dc, 8 - ch * 2);
-    non_zero_ac |= PACK(nz_ac, 8 - ch * 2);
+    // Note: we don't really need the per-4x4 details for U/V blocks.
+    non_zero_uv |= nz_coeffs << (4 * ch);
+    out_t_nz |= (tnz << 4) << ch;
+    out_l_nz |= (lnz & 0xf0) << ch;
  }
-  out_t_nz |= PACK(tnz, 20);
-  out_l_nz |= PACK(lnz, 20);
  mb->nz_ = out_t_nz;
  left_mb->nz_ = out_l_nz;

-  dec->non_zero_ac_ = non_zero_ac;
-  dec->non_zero_ = non_zero_ac | non_zero_dc;
-  mb->skip_ = !dec->non_zero_;
-}
-#undef PACK
+  block->non_zero_y_ = non_zero_y;
+  block->non_zero_uv_ = non_zero_uv;

-//-----------------------------------------------------------------------------
+  // We look at the mode-code of each block and check if some blocks have less
+  // than three non-zero coeffs (code < 2). This is to avoid dithering flat and
+  // empty blocks.
+  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
+
+  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
+}
+
+//------------------------------------------------------------------------------
 // Main loop

 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
-  VP8BitReader* const br = &dec->br_;
  VP8MB* const left = dec->mb_info_ - 1;
-  VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+  VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+  int skip = dec->use_skip_proba_ ? block->skip_ : 0;

-  // Note: we don't save segment map (yet), as we don't expect
-  // to decode more than 1 keyframe.
-  if (dec->segment_hdr_.update_map_) {
-    // Hardcoded tree parsing
-    dec->segment_ = !VP8GetBit(br, dec->proba_.segments_[0]) ?
-        VP8GetBit(br, dec->proba_.segments_[1]) :
-        2 + VP8GetBit(br, dec->proba_.segments_[2]);
-  }
-  info->skip_ = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
-
-  VP8ParseIntraMode(br, dec);
-  if (br->eof_) {
-    return 0;
-  }
-
-  if (!info->skip_) {
-    ParseResiduals(dec, info, token_br);
+  if (!skip) {
+    skip = ParseResiduals(dec, mb, token_br);
  } else {
-    left->nz_ = info->nz_ = 0;
-    if (!dec->is_i4x4_) {
-      left->dc_nz_ = info->dc_nz_ = 0;
+    left->nz_ = mb->nz_ = 0;
+    if (!block->is_i4x4_) {
+      left->nz_dc_ = mb->nz_dc_ = 0;
    }
-    dec->non_zero_ = 0;
-    dec->non_zero_ac_ = 0;
+    block->non_zero_y_ = 0;
+    block->non_zero_uv_ = 0;
  }

-  return (!token_br->eof_);
+  if (dec->filter_type_ > 0) {  // store filter info
+    VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
+    *finfo = dec->fstrengths_[block->segment_][block->is_i4x4_];
+    finfo->f_inner_ |= !skip;
+  }
+
+  return !token_br->eof_;
+}
+
+void VP8InitScanline(VP8Decoder* const dec) {
+  VP8MB* const left = dec->mb_info_ - 1;
+  left->nz_ = 0;
+  left->nz_dc_ = 0;
+  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
+  dec->mb_x_ = 0;
 }

 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
-  for (dec->mb_y_ = 0; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
-    VP8MB* const left = dec->mb_info_ - 1;
+  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
+    // Parse bitstream for this row.
    VP8BitReader* const token_br =
        &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-
-    left->nz_ = 0;
-    left->dc_nz_ = 0;
-    memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
-
-    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+    if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
+      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                         "Premature end-of-partition0 encountered.");
+    }
+    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
      if (!VP8DecodeMB(dec, token_br)) {
        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                           "Premature end-of-file encountered.");
      }
-      VP8ReconstructBlock(dec);
-
-      // Store data and save block's filtering params
-      VP8StoreBlock(dec);
    }
-    if (!VP8FinishRow(dec, io)) {
-      return VP8SetError(dec, VP8_STATUS_USER_ABORT,
-                         "Output aborted.");
+    VP8InitScanline(dec);   // Prepare for next scanline
+
+    // Reconstruct, filter and emit the row.
+    if (!VP8ProcessRow(dec, io)) {
+      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
    }
  }
-
-  // Finish
-#ifndef ONLY_KEYFRAME_CODE
-  if (!dec->update_proba_) {
-    dec->proba_ = dec->proba_saved_;
+  if (dec->mt_method_ > 0) {
+    if (!WebPGetWorkerInterface()->Sync(&dec->worker_)) return 0;
  }
-#endif

  return 1;
 }

 // Main entry point
 int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 0;
  if (dec == NULL) {
    return 0;
  }
@ -641,53 +628,41 @@ int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
  }
  assert(dec->ready_);

-  // will allocate memory and prepare everything.
-  if (!VP8InitFrame(dec, io)) {
-    VP8Clear(dec);
-    return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
-                       "Allocation failed");
+  // Finish setting up the decoding parameter. Will call io->setup().
+  ok = (VP8EnterCritical(dec, io) == VP8_STATUS_OK);
+  if (ok) {   // good to go.
+    // Will allocate memory and prepare everything.
+    if (ok) ok = VP8InitFrame(dec, io);
+
+    // Main decoding loop
+    if (ok) ok = ParseFrame(dec, io);
+
+    // Exit.
+    ok &= VP8ExitCritical(dec, io);
  }

-  if (io->setup && !io->setup(io)) {
+  if (!ok) {
    VP8Clear(dec);
-    return VP8SetError(dec, VP8_STATUS_USER_ABORT,
-                       "Frame setup failed");
-  }
-
-  // Disable filtering per user request (_after_ setup() is called)
-  if (io->bypass_filtering) dec->filter_type_ = 0;
-
-  // Main decoding loop
-  {
-    const int ret = ParseFrame(dec, io);
-    if (io->teardown) {
-      io->teardown(io);
-    }
-    if (!ret) {
-      VP8Clear(dec);
-      return 0;
-    }
+    return 0;
  }

  dec->ready_ = 0;
-  return 1;
+  return ok;
 }

 void VP8Clear(VP8Decoder* const dec) {
  if (dec == NULL) {
    return;
  }
-  if (dec->mem_) {
-    free(dec->mem_);
-  }
+  WebPGetWorkerInterface()->End(&dec->worker_);
+  ALPHDelete(dec->alph_dec_);
+  dec->alph_dec_ = NULL;
+  WebPSafeFree(dec->mem_);
  dec->mem_ = NULL;
  dec->mem_size_ = 0;
  memset(&dec->br_, 0, sizeof(dec->br_));
  dec->ready_ = 0;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // VP8 decoder: internal header.
@ -13,22 +15,24 @@
 #define WEBP_DEC_VP8I_H_

 #include <string.h>     // for memcpy()
-#include "bits.h"
+#include "./vp8li.h"
+#include "../utils/bit_reader.h"
+#include "../utils/random.h"
+#include "../utils/thread.h"
+#include "../dsp/dsp.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Various defines and enums

 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 1
+#define DEC_MIN_VERSION 4
 #define DEC_REV_VERSION 2

-#define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames
-
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
       B_TM_PRED,
@ -95,7 +99,10 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)

-//-----------------------------------------------------------------------------
+// minimal width under which lossy multi-threading is always disabled
+#define MIN_WIDTH_FOR_THREADS 512
+
+//------------------------------------------------------------------------------
 // Headers

 typedef struct {
@ -123,15 +130,19 @@ typedef struct {
  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
 } VP8SegmentHeader;

+
+// probas associated to one of the contexts
+typedef uint8_t VP8ProbaArray[NUM_PROBAS];
+
+typedef struct {   // all the probas associated to one band
+  VP8ProbaArray probas_[NUM_CTX];
+} VP8BandProbas;
+
 // Struct collecting all frame-persistent probabilities.
 typedef struct {
  uint8_t segments_[MB_FEATURE_TREE_PROBS];
  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
-  uint8_t coeffs_[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS];
-#ifndef ONLY_KEYFRAME_CODE
-  uint8_t ymode_[4], uvmode_[3];
-  uint8_t mv_[2][NUM_MV_PROBAS];
-#endif
+  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
 } VP8Proba;

 // Filter parameters
@ -144,27 +155,66 @@ typedef struct {
  int mode_lf_delta_[NUM_MODE_LF_DELTAS];
 } VP8FilterHeader;

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Informations about the macroblocks.

-typedef struct {
-  // block type
-  uint8_t skip_:1;
-  // filter specs
-  uint8_t f_level_:6;      // filter strength: 0..63
-  uint8_t f_ilevel_:6;     // inner limit: 1..63
-  uint8_t f_inner_:1;      // do inner filtering?
-  // cbp
-  uint8_t nz_;        // non-zero AC/DC coeffs
-  uint8_t dc_nz_;     // non-zero DC coeffs
+typedef struct {  // filter specs
+  uint8_t f_limit_;      // filter limit in [3..189], or 0 if no filtering
+  uint8_t f_ilevel_;     // inner limit in [1..63]
+  uint8_t f_inner_;      // do inner filtering?
+  uint8_t hev_thresh_;   // high edge variance threshold in [0..2]
+} VP8FInfo;
+
+typedef struct {  // Top/Left Contexts used for syntax-parsing
+  uint8_t nz_;        // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma)
+  uint8_t nz_dc_;     // non-zero DC coeff (1bit)
 } VP8MB;

 // Dequantization matrices
+typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
-  uint16_t y1_mat_[2], y2_mat_[2], uv_mat_[2];    // [DC / AC]
+  quant_t y1_mat_, y2_mat_, uv_mat_;
+
+  int uv_quant_;   // U/V quantizer value
+  int dither_;     // dithering amplitude (0 = off, max=255)
 } VP8QuantMatrix;

-//-----------------------------------------------------------------------------
+// Data needed to reconstruct a macroblock
+typedef struct {
+  int16_t coeffs_[384];   // 384 coeffs = (16+4+4) * 4*4
+  uint8_t is_i4x4_;       // true if intra4x4
+  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
+  uint8_t uvmode_;        // chroma prediction mode
+  // bit-wise info about the content of each sub-4x4 blocks (in decoding order).
+  // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
+  //   code=0 -> no coefficient
+  //   code=1 -> only DC
+  //   code=2 -> first three coefficients are non-zero
+  //   code=3 -> more than three coefficients are non-zero
+  // This allows to call specialized transform functions.
+  uint32_t non_zero_y_;
+  uint32_t non_zero_uv_;
+  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
+  uint8_t skip_;
+  uint8_t segment_;
+} VP8MBData;
+
+// Persistent information needed by the parallel processing
+typedef struct {
+  int id_;              // cache row to process (in [0..2])
+  int mb_y_;            // macroblock position of the row
+  int filter_row_;      // true if row-filtering is needed
+  VP8FInfo* f_info_;    // filter strengths (swapped with dec->f_info_)
+  VP8MBData* mb_data_;  // reconstruction data (swapped with dec->mb_data_)
+  VP8Io io_;            // copy of the VP8Io to pass to put()
+} VP8ThreadContext;
+
+// Saved top samples, per macroblock. Fits into a cache-line.
+typedef struct {
+  uint8_t y[16], u[8], v[8];
+} VP8TopSamples;
+
+//------------------------------------------------------------------------------
 // VP8Decoder: the main opaque structure handed over to user

 struct VP8Decoder {
@ -181,20 +231,29 @@ struct VP8Decoder {
  VP8FilterHeader  filter_hdr_;
  VP8SegmentHeader segment_hdr_;

+  // Worker
+  WebPWorker worker_;
+  int mt_method_;      // multi-thread method: 0=off, 1=[parse+recon][filter]
+                       // 2=[parse][recon+filter]
+  int cache_id_;       // current cache row
+  int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
+  VP8ThreadContext thread_ctx_;  // Thread context
+
  // dimension, in macroblock units.
  int mb_w_, mb_h_;

+  // Macroblock to process/filter, depending on cropping and filter_type.
+  int tl_mb_x_, tl_mb_y_;  // top-left MB that must be in-loop filtered
+  int br_mb_x_, br_mb_y_;  // last bottom-right MB that must be decoded
+
  // number of partitions.
  int num_parts_;
  // per-partition boolean decoders.
  VP8BitReader parts_[MAX_NUM_PARTITIONS];

-  // buffer refresh flags
-  //   bit 0: refresh Gold, bit 1: refresh Alt
-  //   bit 2-3: copy to Gold, bit 4-5: copy to Alt
-  //   bit 6: Gold sign bias, bit 7: Alt sign bias
-  //   bit 8: refresh last frame
-  uint32_t buffer_flags_;
+  // Dithering strength, deduced from decoding options
+  int dither_;                // whether to use dithering or not
+  VP8Random dithering_rg_;    // random generator for dithering

  // dequantization (one set of DC/AC dequant factor per segment)
  VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
@ -203,23 +262,18 @@ struct VP8Decoder {
  VP8Proba proba_;
  int use_skip_proba_;
  uint8_t skip_p_;
-#ifndef ONLY_KEYFRAME_CODE
-  uint8_t intra_p_, last_p_, golden_p_;
-  VP8Proba proba_saved_;
-  int update_proba_;
-#endif

  // Boundary data cache and persistent buffers.
-  uint8_t* intra_t_;     // top intra modes values: 4 * mb_w_
-  uint8_t  intra_l_[4];  // left intra modes values
-  uint8_t *y_t_;         // top luma samples: 16 * mb_w_
-  uint8_t *u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
+  uint8_t* intra_t_;      // top intra modes values: 4 * mb_w_
+  uint8_t  intra_l_[4];   // left intra modes values

-  VP8MB* mb_info_;       // contextual macroblock infos (mb_w_ + 1)
-  uint8_t* yuv_b_;       // main block for Y/U/V (size = YUV_SIZE)
-  int16_t* coeffs_;      // 384 coeffs = (16+8+8) * 4*4
+  VP8TopSamples* yuv_t_;  // top y/u/v samples

-  uint8_t* cache_y_;     // macroblock row for storing unfiltered samples
+  VP8MB* mb_info_;        // contextual macroblock info (mb_w_ + 1)
+  VP8FInfo* f_info_;      // filter strength info
+  uint8_t* yuv_b_;        // main block for Y/U/V (size = YUV_SIZE)
+
+  uint8_t* cache_y_;      // macroblock row for storing unfiltered samples
  uint8_t* cache_u_;
  uint8_t* cache_v_;
  int cache_y_stride_;
@ -227,99 +281,74 @@ struct VP8Decoder {

  // main memory chunk for the above data. Persistent.
  void* mem_;
-  int mem_size_;
+  size_t mem_size_;

  // Per macroblock non-persistent infos.
  int mb_x_, mb_y_;       // current position, in macroblock units
-  uint8_t is_i4x4_;       // true if intra4x4
-  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
-  uint8_t uvmode_;        // chroma prediction mode
-  uint8_t segment_;       // block's segment
-
-  // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
-  // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for
-  // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order.
-  // If the bit is set, the 4x4 block contains some non-zero coefficients.
-  uint32_t non_zero_;
-  uint32_t non_zero_ac_;
+  VP8MBData* mb_data_;    // parsed reconstruction data

  // Filtering side-info
-  int filter_type_;                       // 0=off, 1=simple, 2=complex
-  uint8_t filter_levels_[NUM_MB_SEGMENTS];  // precalculated per-segment
+  int filter_type_;                          // 0=off, 1=simple, 2=complex
+  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type
+
+  // Alpha
+  struct ALPHDecoder* alph_dec_;  // alpha-plane decoder object
+  const uint8_t* alpha_data_;     // compressed alpha data (if present)
+  size_t alpha_data_size_;
+  int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
+  uint8_t* alpha_plane_;  // output. Persistent, contains the whole data.
+  int alpha_dithering_;   // derived from decoding options (0=off, 100=full).
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // internal functions. Not public.

 // in vp8.c
 int VP8SetError(VP8Decoder* const dec,
-                VP8StatusCode error, const char * const msg);
+                VP8StatusCode error, const char* const msg);

 // in tree.c
 void VP8ResetProba(VP8Proba* const proba);
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
-void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec);
+// parses one row of intra mode data in partition 0, returns !eof
+int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);

 // in quant.c
 void VP8ParseQuant(VP8Decoder* const dec);

 // in frame.c
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
-// Predict a block and add residual
-void VP8ReconstructBlock(VP8Decoder* const dec);
-// Store a block, along with filtering params
-void VP8StoreBlock(VP8Decoder* const dec);
-// Finalize and transmit a complete row. Return false in case of user-abort.
-int VP8FinishRow(VP8Decoder* const dec, VP8Io* io);
+// Call io->setup() and finish setting up scan parameters.
+// After this call returns, one must always call VP8ExitCritical() with the
+// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
+// if ok, otherwise sets and returns the error status on *dec.
+VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
+// Must always be called in pair with VP8EnterCritical().
+// Returns false in case of error.
+int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
+// Return the multi-threading method to use (0=off), depending
+// on options and bitstream size. Only for lossy decoding.
+int VP8GetThreadMethod(const WebPDecoderOptions* const options,
+                       const WebPHeaderStructure* const headers,
+                       int width, int height);
+// Initialize dithering post-process if needed.
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec);
+// Process the last decoded row (filtering + output).
+int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
+// To be called at the start of a new scanline, to initialize predictors.
+void VP8InitScanline(VP8Decoder* const dec);
 // Decode one macroblock. Returns false if there is not enough data.
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);

-// in dsp.c
-typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst);
-extern VP8Idct VP8Transform;
-extern VP8Idct VP8TransformUV;
-extern VP8Idct VP8TransformDC;
-extern VP8Idct VP8TransformDCUV;
-extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+// in alpha.c
+const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      int row, int num_rows);

-// *dst is the destination block, with stride BPS. Boundary samples are
-// assumed accessible when needed.
-typedef void (*VP8PredFunc)(uint8_t *dst);
-extern VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
-extern VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
-extern VP8PredFunc VP8PredLuma4[NUM_BMODES];
+//------------------------------------------------------------------------------

-void VP8DspInit(void);        // must be called before anything using the above
-void VP8DspInitTables(void);  // needs to be called no matter what.
-
-// simple filter (only for luma)
-typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
-extern VP8SimpleFilterFunc VP8SimpleVFilter16;
-extern VP8SimpleFilterFunc VP8SimpleHFilter16;
-extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
-extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
-
-// regular filter (on both macroblock edges and inner edges)
-typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
-                                  int thresh, int ithresh, int hev_t);
-typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
-                                    int thresh, int ithresh, int hev_t);
-// on outter edge
-extern VP8LumaFilterFunc VP8VFilter16;
-extern VP8LumaFilterFunc VP8HFilter16;
-extern VP8ChromaFilterFunc VP8VFilter8;
-extern VP8ChromaFilterFunc VP8HFilter8;
-
-// on inner edge
-extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
-extern VP8LumaFilterFunc VP8HFilter16i;
-extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
-extern VP8ChromaFilterFunc VP8HFilter8i;
-
-//-----------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

-#endif  // WEBP_DEC_VP8I_H_
+#endif  /* WEBP_DEC_VP8I_H_ */
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
--- a/src/dec/vp8li.h
+++ b/src/dec/vp8li.h
@ -0,0 +1,132 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Lossless decoder: internal header.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+//         Vikas Arora(vikaas.arora@gmail.com)
+
+#ifndef WEBP_DEC_VP8LI_H_
+#define WEBP_DEC_VP8LI_H_
+
+#include <string.h>     // for memcpy()
+#include "./webpi.h"
+#include "../utils/bit_reader.h"
+#include "../utils/color_cache.h"
+#include "../utils/huffman.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  READ_DATA = 0,
+  READ_HDR = 1,
+  READ_DIM = 2
+} VP8LDecodeState;
+
+typedef struct VP8LTransform VP8LTransform;
+struct VP8LTransform {
+  VP8LImageTransformType type_;   // transform type.
+  int                    bits_;   // subsampling bits defining transform window.
+  int                    xsize_;  // transform window X index.
+  int                    ysize_;  // transform window Y index.
+  uint32_t              *data_;   // transform data.
+};
+
+typedef struct {
+  int             color_cache_size_;
+  VP8LColorCache  color_cache_;
+
+  int             huffman_mask_;
+  int             huffman_subsample_bits_;
+  int             huffman_xsize_;
+  uint32_t       *huffman_image_;
+  int             num_htree_groups_;
+  HTreeGroup     *htree_groups_;
+} VP8LMetadata;
+
+typedef struct VP8LDecoder VP8LDecoder;
+struct VP8LDecoder {
+  VP8StatusCode    status_;
+  VP8LDecodeState  action_;
+  VP8LDecodeState  state_;
+  VP8Io           *io_;
+
+  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
+
+  uint32_t        *pixels_;        // Internal data: either uint8_t* for alpha
+                                   // or uint32_t* for BGRA.
+  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
+
+  VP8LBitReader    br_;
+
+  int              width_;
+  int              height_;
+  int              last_row_;      // last input row decoded so far.
+  int              last_pixel_;    // last pixel decoded so far. However, it may
+                                   // not be transformed, scaled and
+                                   // color-converted yet.
+  int              last_out_row_;  // last row output so far.
+
+  VP8LMetadata     hdr_;
+
+  int              next_transform_;
+  VP8LTransform    transforms_[NUM_TRANSFORMS];
+  // or'd bitset storing the transforms types.
+  uint32_t         transforms_seen_;
+
+  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
+  WebPRescaler    *rescaler;         // Common rescaler for all channels.
+};
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+struct ALPHDecoder;  // Defined in dec/alphai.h.
+
+// in vp8l.c
+
+// Decodes image header for alpha data stored using lossless compression.
+// Returns false in case of error.
+int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
+                          const uint8_t* const data, size_t data_size,
+                          uint8_t* const output);
+
+// Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
+// already decoded in previous call(s), it will resume decoding from where it
+// was paused.
+// Returns false in case of bitstream error.
+int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
+                               int last_row);
+
+// Allocates and initialize a new lossless decoder instance.
+VP8LDecoder* VP8LNew(void);
+
+// Decodes the image header. Returns false in case of error.
+int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
+
+// Decodes an image. It's required to decode the lossless header before calling
+// this function. Returns false in case of error, with updated dec->status_.
+int VP8LDecodeImage(VP8LDecoder* const dec);
+
+// Resets the decoder in its initial state, reclaiming memory.
+// Preserves the dec->status_ value.
+void VP8LClear(VP8LDecoder* const dec);
+
+// Clears and deallocate a lossless decoder instance.
+void VP8LDelete(VP8LDecoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_VP8LI_H_ */
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@ -1,63 +1,120 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Internal header: WebP decoding parameters and custom IO on buffer
 //
 // Author: somnath@google.com (Somnath Banerjee)

-#ifndef WEBP_DEC_WEBPI_H
-#define WEBP_DEC_WEBPI_H
+#ifndef WEBP_DEC_WEBPI_H_
+#define WEBP_DEC_WEBPI_H_

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

-#include "webp/decode_vp8.h"
+#include "../utils/rescaler.h"
+#include "./decode_vp8.h"

-// Decoding output parameters.
+//------------------------------------------------------------------------------
+// WebPDecParams: Decoding output parameters. Transient internal object.
+
+typedef struct WebPDecParams WebPDecParams;
+typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
+typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos);
+
+struct WebPDecParams {
+  WebPDecBuffer* output;             // output buffer.
+  uint8_t* tmp_y, *tmp_u, *tmp_v;    // cache for the fancy upsampler
+                                     // or used for tmp rescaling
+
+  int last_y;                 // coordinate of the line that was last output
+  const WebPDecoderOptions* options;  // if not NULL, use alt decoding features
+  // rescalers
+  WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
+  void* memory;                  // overall scratch memory for the output work.
+
+  OutputFunc emit;               // output RGB or YUV samples
+  OutputFunc emit_alpha;         // output alpha channel
+  OutputRowFunc emit_alpha_row;  // output one line of rescaled alpha values
+};
+
+// Should be called first, before any use of the WebPDecParams object.
+void WebPResetDecParams(WebPDecParams* const params);
+
+//------------------------------------------------------------------------------
+// Header parsing helpers
+
+// Structure storing a description of the RIFF headers.
 typedef struct {
-  uint8_t* output;      // rgb(a) or luma
-  uint8_t *u, *v;       // chroma u/v
-  uint8_t *top_y, *top_u, *top_v;   // cache for the fancy upscaler
-  int stride;           // rgb(a) stride or luma stride
-  int u_stride;         // chroma-u stride
-  int v_stride;         // chroma-v stride
-  WEBP_CSP_MODE mode;   // rgb(a) or yuv
-  int last_y;           // coordinate of the line that was last output
-  int output_size;      // size of 'output' buffer
-  int output_u_size;    // size of 'u' buffer
-  int output_v_size;    // size of 'v' buffer
-  int external_buffer;  // If true, the output buffers are externally owned
-} WebPDecParams;
+  const uint8_t* data;         // input buffer
+  size_t data_size;            // input buffer size
+  int have_all_data;           // true if all data is known to be available
+  size_t offset;               // offset to main data chunk (VP8 or VP8L)
+  const uint8_t* alpha_data;   // points to alpha chunk (if present)
+  size_t alpha_data_size;      // alpha chunk size
+  size_t compressed_size;      // VP8/VP8L compressed data size
+  size_t riff_size;            // size of the riff payload (or 0 if absent)
+  int is_lossless;             // true if a VP8L chunk is present
+} WebPHeaderStructure;

-// If a RIFF container is detected, validate it and skip over it. Returns
-// VP8 bit-stream size if RIFF header is valid else returns 0
-uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr,
-                             uint32_t *data_size_ptr);
+// Skips over all valid chunks prior to the first VP8/VP8L frame header.
+// Returns: VP8_STATUS_OK, VP8_STATUS_BITSTREAM_ERROR (invalid header/chunk),
+// VP8_STATUS_NOT_ENOUGH_DATA (partial input) or VP8_STATUS_UNSUPPORTED_FEATURE
+// in the case of non-decodable features (animation for instance).
+// In 'headers', compressed_size, offset, alpha_data, alpha_size, and lossless
+// fields are updated appropriately upon success.
+VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);

-// Initializes VP8Io with custom setup, io and teardown functions
-void WebPInitCustomIo(VP8Io* const io);
+//------------------------------------------------------------------------------
+// Misc utils

-// Initializes params_out by allocating output buffer and setting the
-// stride information. It also outputs width and height information of
-// the WebP image. Returns 1 if succeeds.
-int WebPInitDecParams(const uint8_t* data, uint32_t data_size, int* width,
-                      int* height, WebPDecParams* const params_out);
+// Initializes VP8Io with custom setup, io and teardown functions. The default
+// hooks will use the supplied 'params' as io->opaque handle.
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);

-// Verifies various size configurations (e.g stride >= width, specified
-// output size <= stride * height etc.). Returns 0 if checks fail.
-int WebPCheckDecParams(const VP8Io* io, const WebPDecParams* params);
+// Setup crop_xxx fields, mb_w and mb_h in io. 'src_colorspace' refers
+// to the *compressed* format, not the output one.
+int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
+                          VP8Io* const io, WEBP_CSP_MODE src_colorspace);

-// Deallocate memory allocated by WebPInitDecParams() and reset the
-// WebPDecParams object.
-void WebPClearDecParams(WebPDecParams* params);
+//------------------------------------------------------------------------------
+// Internal functions regarding WebPDecBuffer memory (in buffer.c).
+// Don't really need to be externally visible for now.

-#if defined(__cplusplus) || defined(c_plusplus)
+// Prepare 'buffer' with the requested initial dimensions width/height.
+// If no external storage is supplied, initializes buffer by allocating output
+// memory and setting up the stride information. Validate the parameters. Return
+// an error code in case of problem (no memory, or invalid stride / size /
+// dimension / etc.). If *options is not NULL, also verify that the options'
+// parameters are valid and apply them to the width/height dimensions of the
+// output buffer. This takes cropping / scaling / rotation into account.
+// Also incorporates the options->flip flag to flip the buffer parameters if
+// needed.
+VP8StatusCode WebPAllocateDecBuffer(int width, int height,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const buffer);
+
+// Flip buffer vertically by negating the various strides.
+VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer);
+
+// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
+// memory (still held by 'src').
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst);
+
+// Copy and transfer ownership from src to dst (beware of parameter order!)
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
 }    // extern "C"
 #endif

-#endif  // WEBP_DEC_WEBPI_H
+#endif  /* WEBP_DEC_WEBPI_H_ */
--- a/src/dec/yuv.c
+++ b/src/dec/yuv.c
@ -1,46 +0,0 @@
-// Copyright 2010 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// YUV->RGB conversion function
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "yuv.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-enum { YUV_HALF = 1 << (YUV_FIX - 1) };
-
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
-static int done = 0;
-
-void VP8YUVInit(void) {
-  int i;
-  if (done) {
-    return;
-  }
-  for (i = 0; i < 256; ++i) {
-    VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
-    VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
-    VP8kVToG[i] = -45773 * (i - 128);
-    VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
-  }
-  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
-    const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
-    VP8kClip[i - YUV_RANGE_MIN] = (k < 0) ? 0 : (k > 255) ? 255 : k;
-  }
-  done = 1;
-}
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/yuv.h
+++ b/src/dec/yuv.h
@ -1,66 +0,0 @@
-// Copyright 2010 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// inline YUV->RGB conversion function
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#ifndef WEBP_DEC_YUV_H_
-#define WEBP_DEC_YUV_H_
-
-#include "webp/decode_vp8.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-enum { YUV_FIX = 16,                // fixed-point precision
-       YUV_RANGE_MIN = -227,        // min value of r/g/b output
-       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
-};
-extern int16_t VP8kVToR[256], VP8kUToB[256];
-extern int32_t VP8kVToG[256], VP8kUToG[256];
-extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
-inline static void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
-                               uint8_t* const rgb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
-  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
-  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
-}
-
-inline static void VP8YuvToRgba(int y, int u, int v, uint8_t* const rgba) {
-  VP8YuvToRgb(y, u, v, rgba);
-  rgba[3] = 0xff;
-}
-
-inline static void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
-                               uint8_t* const bgr) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
-  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
-  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
-}
-
-inline static void VP8YuvToBgra(int y, int u, int v, uint8_t* const bgra) {
-  VP8YuvToBgr(y, u, v, bgra);
-  bgra[3] = 0xff;
-}
-
-// Must be called before everything, to initialize the tables.
-void VP8YUVInit(void);
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif  // WEBP_DEC_YUV_H_
--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -0,0 +1,14 @@
+lib_LTLIBRARIES = libwebpdemux.la
+
+libwebpdemux_la_SOURCES =
+libwebpdemux_la_SOURCES += demux.c
+
+libwebpdemuxinclude_HEADERS =
+libwebpdemuxinclude_HEADERS += ../webp/demux.h
+libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
+libwebpdemuxinclude_HEADERS += ../webp/types.h
+
+libwebpdemux_la_LIBADD = ../libwebp.la
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 1:2:0
+libwebpdemuxincludedir = $(includedir)/webp
+pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
--- a/src/demux/libwebpdemux.pc.in
+++ b/src/demux/libwebpdemux.pc.in
@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libwebpdemux
+Description: Library for parsing the WebP graphics format container
+Version: @PACKAGE_VERSION@
+Requires: libwebp >= 0.2.0
+Cflags: -I${includedir}
+Libs: -L${libdir} -lwebpdemux
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -0,0 +1,72 @@
+noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
+noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
+
+if BUILD_LIBWEBPDECODER
+  noinst_LTLIBRARIES += libwebpdspdecode.la
+endif
+
+common_HEADERS = ../webp/types.h
+commondir = $(includedir)/webp
+
+COMMON_SOURCES =
+COMMON_SOURCES += alpha_processing.c
+COMMON_SOURCES += cpu.c
+COMMON_SOURCES += dec.c
+COMMON_SOURCES += dec_clip_tables.c
+COMMON_SOURCES += dec_mips32.c
+COMMON_SOURCES += dec_neon.c
+COMMON_SOURCES += dsp.h
+COMMON_SOURCES += lossless.c
+COMMON_SOURCES += lossless.h
+COMMON_SOURCES += lossless_mips32.c
+COMMON_SOURCES += lossless_neon.c
+COMMON_SOURCES += neon.h
+COMMON_SOURCES += upsampling.c
+COMMON_SOURCES += upsampling_neon.c
+COMMON_SOURCES += yuv.c
+COMMON_SOURCES += yuv.h
+COMMON_SOURCES += yuv_mips32.c
+
+ENC_SOURCES =
+ENC_SOURCES += enc.c
+ENC_SOURCES += enc_mips32.c
+ENC_SOURCES += enc_neon.c
+
+libwebpdsp_avx2_la_SOURCES =
+libwebpdsp_avx2_la_SOURCES += enc_avx2.c
+libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)
+
+libwebpdspdecode_sse2_la_SOURCES =
+libwebpdspdecode_sse2_la_SOURCES += alpha_processing_sse2.c
+libwebpdspdecode_sse2_la_SOURCES += dec_sse2.c
+libwebpdspdecode_sse2_la_SOURCES += lossless_sse2.c
+libwebpdspdecode_sse2_la_SOURCES += upsampling_sse2.c
+libwebpdspdecode_sse2_la_SOURCES += yuv_sse2.c
+libwebpdspdecode_sse2_la_SOURCES += yuv_tables_sse2.h
+libwebpdspdecode_sse2_la_CPPFLAGS = $(libwebpdsp_sse2_la_CPPFLAGS)
+libwebpdspdecode_sse2_la_CFLAGS = $(libwebpdsp_sse2_la_CFLAGS)
+
+libwebpdsp_sse2_la_SOURCES =
+libwebpdsp_sse2_la_SOURCES += enc_sse2.c
+libwebpdsp_sse2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdsp_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
+libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la
+
+libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES)
+
+noinst_HEADERS =
+noinst_HEADERS += ../dec/decode_vp8.h
+noinst_HEADERS += ../webp/decode.h
+
+libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
+libwebpdsp_la_LDFLAGS = -lm
+libwebpdsp_la_LIBADD = libwebpdsp_avx2.la libwebpdsp_sse2.la
+
+if BUILD_LIBWEBPDECODER
+  libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)
+
+  libwebpdspdecode_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+  libwebpdspdecode_la_LDFLAGS = $(libwebpdsp_la_LDFLAGS)
+  libwebpdspdecode_la_LIBADD = libwebpdspdecode_sse2.la
+endif
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@ -0,0 +1,329 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include "./dsp.h"
+
+// Tables can be faster on some platform but incur some extra binary size (~2k).
+// #define USE_TABLES_FOR_ALPHA_MULT
+
+// -----------------------------------------------------------------------------
+
+#define MFIX 24    // 24bit fixed-point arithmetic
+#define HALF ((1u << MFIX) >> 1)
+#define KINV_255 ((1u << MFIX) / 255u)
+
+static uint32_t Mult(uint8_t x, uint32_t mult) {
+  const uint32_t v = (x * mult + HALF) >> MFIX;
+  assert(v <= 255);  // <- 24bit precision is enough to ensure that.
+  return v;
+}
+
+#ifdef USE_TABLES_FOR_ALPHA_MULT
+
+static const uint32_t kMultTables[2][256] = {
+  {    // (255u << MFIX) / alpha
+    0x00000000, 0xff000000, 0x7f800000, 0x55000000, 0x3fc00000, 0x33000000,
+    0x2a800000, 0x246db6db, 0x1fe00000, 0x1c555555, 0x19800000, 0x172e8ba2,
+    0x15400000, 0x139d89d8, 0x1236db6d, 0x11000000, 0x0ff00000, 0x0f000000,
+    0x0e2aaaaa, 0x0d6bca1a, 0x0cc00000, 0x0c249249, 0x0b9745d1, 0x0b1642c8,
+    0x0aa00000, 0x0a333333, 0x09cec4ec, 0x0971c71c, 0x091b6db6, 0x08cb08d3,
+    0x08800000, 0x0839ce73, 0x07f80000, 0x07ba2e8b, 0x07800000, 0x07492492,
+    0x07155555, 0x06e45306, 0x06b5e50d, 0x0689d89d, 0x06600000, 0x063831f3,
+    0x06124924, 0x05ee23b8, 0x05cba2e8, 0x05aaaaaa, 0x058b2164, 0x056cefa8,
+    0x05500000, 0x05343eb1, 0x05199999, 0x05000000, 0x04e76276, 0x04cfb2b7,
+    0x04b8e38e, 0x04a2e8ba, 0x048db6db, 0x0479435e, 0x04658469, 0x045270d0,
+    0x04400000, 0x042e29f7, 0x041ce739, 0x040c30c3, 0x03fc0000, 0x03ec4ec4,
+    0x03dd1745, 0x03ce540f, 0x03c00000, 0x03b21642, 0x03a49249, 0x03976fc6,
+    0x038aaaaa, 0x037e3f1f, 0x03722983, 0x03666666, 0x035af286, 0x034fcace,
+    0x0344ec4e, 0x033a5440, 0x03300000, 0x0325ed09, 0x031c18f9, 0x0312818a,
+    0x03092492, 0x03000000, 0x02f711dc, 0x02ee5846, 0x02e5d174, 0x02dd7baf,
+    0x02d55555, 0x02cd5cd5, 0x02c590b2, 0x02bdef7b, 0x02b677d4, 0x02af286b,
+    0x02a80000, 0x02a0fd5c, 0x029a1f58, 0x029364d9, 0x028ccccc, 0x0286562d,
+    0x02800000, 0x0279c952, 0x0273b13b, 0x026db6db, 0x0267d95b, 0x026217ec,
+    0x025c71c7, 0x0256e62a, 0x0251745d, 0x024c1bac, 0x0246db6d, 0x0241b2f9,
+    0x023ca1af, 0x0237a6f4, 0x0232c234, 0x022df2df, 0x02293868, 0x02249249,
+    0x02200000, 0x021b810e, 0x021714fb, 0x0212bb51, 0x020e739c, 0x020a3d70,
+    0x02061861, 0x02020408, 0x01fe0000, 0x01fa0be8, 0x01f62762, 0x01f25213,
+    0x01ee8ba2, 0x01ead3ba, 0x01e72a07, 0x01e38e38, 0x01e00000, 0x01dc7f10,
+    0x01d90b21, 0x01d5a3e9, 0x01d24924, 0x01cefa8d, 0x01cbb7e3, 0x01c880e5,
+    0x01c55555, 0x01c234f7, 0x01bf1f8f, 0x01bc14e5, 0x01b914c1, 0x01b61eed,
+    0x01b33333, 0x01b05160, 0x01ad7943, 0x01aaaaaa, 0x01a7e567, 0x01a5294a,
+    0x01a27627, 0x019fcbd2, 0x019d2a20, 0x019a90e7, 0x01980000, 0x01957741,
+    0x0192f684, 0x01907da4, 0x018e0c7c, 0x018ba2e8, 0x018940c5, 0x0186e5f0,
+    0x01849249, 0x018245ae, 0x01800000, 0x017dc11f, 0x017b88ee, 0x0179574e,
+    0x01772c23, 0x01750750, 0x0172e8ba, 0x0170d045, 0x016ebdd7, 0x016cb157,
+    0x016aaaaa, 0x0168a9b9, 0x0166ae6a, 0x0164b8a7, 0x0162c859, 0x0160dd67,
+    0x015ef7bd, 0x015d1745, 0x015b3bea, 0x01596596, 0x01579435, 0x0155c7b4,
+    0x01540000, 0x01523d03, 0x01507eae, 0x014ec4ec, 0x014d0fac, 0x014b5edc,
+    0x0149b26c, 0x01480a4a, 0x01466666, 0x0144c6af, 0x01432b16, 0x0141938b,
+    0x01400000, 0x013e7063, 0x013ce4a9, 0x013b5cc0, 0x0139d89d, 0x01385830,
+    0x0136db6d, 0x01356246, 0x0133ecad, 0x01327a97, 0x01310bf6, 0x012fa0be,
+    0x012e38e3, 0x012cd459, 0x012b7315, 0x012a150a, 0x0128ba2e, 0x01276276,
+    0x01260dd6, 0x0124bc44, 0x01236db6, 0x01222222, 0x0120d97c, 0x011f93bc,
+    0x011e50d7, 0x011d10c4, 0x011bd37a, 0x011a98ef, 0x0119611a, 0x01182bf2,
+    0x0116f96f, 0x0115c988, 0x01149c34, 0x0113716a, 0x01124924, 0x01112358,
+    0x01100000, 0x010edf12, 0x010dc087, 0x010ca458, 0x010b8a7d, 0x010a72f0,
+    0x01095da8, 0x01084a9f, 0x010739ce, 0x01062b2e, 0x01051eb8, 0x01041465,
+    0x01030c30, 0x01020612, 0x01010204, 0x01000000 },
+  {   // alpha * KINV_255
+    0x00000000, 0x00010101, 0x00020202, 0x00030303, 0x00040404, 0x00050505,
+    0x00060606, 0x00070707, 0x00080808, 0x00090909, 0x000a0a0a, 0x000b0b0b,
+    0x000c0c0c, 0x000d0d0d, 0x000e0e0e, 0x000f0f0f, 0x00101010, 0x00111111,
+    0x00121212, 0x00131313, 0x00141414, 0x00151515, 0x00161616, 0x00171717,
+    0x00181818, 0x00191919, 0x001a1a1a, 0x001b1b1b, 0x001c1c1c, 0x001d1d1d,
+    0x001e1e1e, 0x001f1f1f, 0x00202020, 0x00212121, 0x00222222, 0x00232323,
+    0x00242424, 0x00252525, 0x00262626, 0x00272727, 0x00282828, 0x00292929,
+    0x002a2a2a, 0x002b2b2b, 0x002c2c2c, 0x002d2d2d, 0x002e2e2e, 0x002f2f2f,
+    0x00303030, 0x00313131, 0x00323232, 0x00333333, 0x00343434, 0x00353535,
+    0x00363636, 0x00373737, 0x00383838, 0x00393939, 0x003a3a3a, 0x003b3b3b,
+    0x003c3c3c, 0x003d3d3d, 0x003e3e3e, 0x003f3f3f, 0x00404040, 0x00414141,
+    0x00424242, 0x00434343, 0x00444444, 0x00454545, 0x00464646, 0x00474747,
+    0x00484848, 0x00494949, 0x004a4a4a, 0x004b4b4b, 0x004c4c4c, 0x004d4d4d,
+    0x004e4e4e, 0x004f4f4f, 0x00505050, 0x00515151, 0x00525252, 0x00535353,
+    0x00545454, 0x00555555, 0x00565656, 0x00575757, 0x00585858, 0x00595959,
+    0x005a5a5a, 0x005b5b5b, 0x005c5c5c, 0x005d5d5d, 0x005e5e5e, 0x005f5f5f,
+    0x00606060, 0x00616161, 0x00626262, 0x00636363, 0x00646464, 0x00656565,
+    0x00666666, 0x00676767, 0x00686868, 0x00696969, 0x006a6a6a, 0x006b6b6b,
+    0x006c6c6c, 0x006d6d6d, 0x006e6e6e, 0x006f6f6f, 0x00707070, 0x00717171,
+    0x00727272, 0x00737373, 0x00747474, 0x00757575, 0x00767676, 0x00777777,
+    0x00787878, 0x00797979, 0x007a7a7a, 0x007b7b7b, 0x007c7c7c, 0x007d7d7d,
+    0x007e7e7e, 0x007f7f7f, 0x00808080, 0x00818181, 0x00828282, 0x00838383,
+    0x00848484, 0x00858585, 0x00868686, 0x00878787, 0x00888888, 0x00898989,
+    0x008a8a8a, 0x008b8b8b, 0x008c8c8c, 0x008d8d8d, 0x008e8e8e, 0x008f8f8f,
+    0x00909090, 0x00919191, 0x00929292, 0x00939393, 0x00949494, 0x00959595,
+    0x00969696, 0x00979797, 0x00989898, 0x00999999, 0x009a9a9a, 0x009b9b9b,
+    0x009c9c9c, 0x009d9d9d, 0x009e9e9e, 0x009f9f9f, 0x00a0a0a0, 0x00a1a1a1,
+    0x00a2a2a2, 0x00a3a3a3, 0x00a4a4a4, 0x00a5a5a5, 0x00a6a6a6, 0x00a7a7a7,
+    0x00a8a8a8, 0x00a9a9a9, 0x00aaaaaa, 0x00ababab, 0x00acacac, 0x00adadad,
+    0x00aeaeae, 0x00afafaf, 0x00b0b0b0, 0x00b1b1b1, 0x00b2b2b2, 0x00b3b3b3,
+    0x00b4b4b4, 0x00b5b5b5, 0x00b6b6b6, 0x00b7b7b7, 0x00b8b8b8, 0x00b9b9b9,
+    0x00bababa, 0x00bbbbbb, 0x00bcbcbc, 0x00bdbdbd, 0x00bebebe, 0x00bfbfbf,
+    0x00c0c0c0, 0x00c1c1c1, 0x00c2c2c2, 0x00c3c3c3, 0x00c4c4c4, 0x00c5c5c5,
+    0x00c6c6c6, 0x00c7c7c7, 0x00c8c8c8, 0x00c9c9c9, 0x00cacaca, 0x00cbcbcb,
+    0x00cccccc, 0x00cdcdcd, 0x00cecece, 0x00cfcfcf, 0x00d0d0d0, 0x00d1d1d1,
+    0x00d2d2d2, 0x00d3d3d3, 0x00d4d4d4, 0x00d5d5d5, 0x00d6d6d6, 0x00d7d7d7,
+    0x00d8d8d8, 0x00d9d9d9, 0x00dadada, 0x00dbdbdb, 0x00dcdcdc, 0x00dddddd,
+    0x00dedede, 0x00dfdfdf, 0x00e0e0e0, 0x00e1e1e1, 0x00e2e2e2, 0x00e3e3e3,
+    0x00e4e4e4, 0x00e5e5e5, 0x00e6e6e6, 0x00e7e7e7, 0x00e8e8e8, 0x00e9e9e9,
+    0x00eaeaea, 0x00ebebeb, 0x00ececec, 0x00ededed, 0x00eeeeee, 0x00efefef,
+    0x00f0f0f0, 0x00f1f1f1, 0x00f2f2f2, 0x00f3f3f3, 0x00f4f4f4, 0x00f5f5f5,
+    0x00f6f6f6, 0x00f7f7f7, 0x00f8f8f8, 0x00f9f9f9, 0x00fafafa, 0x00fbfbfb,
+    0x00fcfcfc, 0x00fdfdfd, 0x00fefefe, 0x00ffffff }
+};
+
+static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
+  return kMultTables[!inverse][a];
+}
+
+#else
+
+static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
+  return inverse ? (255u << MFIX) / a : a * KINV_255;
+}
+
+#endif    // USE_TABLES_FOR_ALPHA_MULT
+
+static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint32_t argb = ptr[x];
+    if (argb < 0xff000000u) {      // alpha < 255
+      if (argb <= 0x00ffffffu) {   // alpha == 0
+        ptr[x] = 0;
+      } else {
+        const uint32_t alpha = (argb >> 24) & 0xff;
+        const uint32_t scale = GetScale(alpha, inverse);
+        uint32_t out = argb & 0xff000000u;
+        out |= Mult(argb >>  0, scale) <<  0;
+        out |= Mult(argb >>  8, scale) <<  8;
+        out |= Mult(argb >> 16, scale) << 16;
+        ptr[x] = out;
+      }
+    }
+  }
+}
+
+static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
+                    int width, int inverse) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint32_t a = alpha[x];
+    if (a != 255) {
+      if (a == 0) {
+        ptr[x] = 0;
+      } else {
+        const uint32_t scale = GetScale(a, inverse);
+        ptr[x] = Mult(ptr[x], scale);
+      }
+    }
+  }
+}
+
+#undef KINV_255
+#undef HALF
+#undef MFIX
+
+void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
+void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+                    int width, int inverse);
+
+//------------------------------------------------------------------------------
+// Generic per-plane calls
+
+void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
+                      int inverse) {
+  int n;
+  for (n = 0; n < num_rows; ++n) {
+    WebPMultARGBRow((uint32_t*)ptr, width, inverse);
+    ptr += stride;
+  }
+}
+
+void WebPMultRows(uint8_t* ptr, int stride,
+                  const uint8_t* alpha, int alpha_stride,
+                  int width, int num_rows, int inverse) {
+  int n;
+  for (n = 0; n < num_rows; ++n) {
+    WebPMultRow(ptr, alpha, width, inverse);
+    ptr += stride;
+    alpha += alpha_stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Premultiplied modes
+
+// non dithered-modes
+
+// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
+// for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
+// one can use instead: (x * a * 65793 + (1 << 23)) >> 24
+#if 1     // (int)(x * a / 255.)
+#define MULTIPLIER(a)   ((a) * 32897U)
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
+#else     // (int)(x * a / 255. + .5)
+#define MULTIPLIER(a) ((a) * 65793U)
+#define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
+#endif
+
+static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
+                               int w, int h, int stride) {
+  while (h-- > 0) {
+    uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+    const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+    int i;
+    for (i = 0; i < w; ++i) {
+      const uint32_t a = alpha[4 * i];
+      if (a != 0xff) {
+        const uint32_t mult = MULTIPLIER(a);
+        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+      }
+    }
+    rgba += stride;
+  }
+}
+#undef MULTIPLIER
+#undef PREMULTIPLY
+
+// rgbA4444
+
+#define MULTIPLIER(a)  ((a) * 0x1111)    // 0x1111 ~= (1 << 16) / 15
+
+static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
+  return (x & 0xf0) | (x >> 4);
+}
+
+static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
+  return (x & 0x0f) | (x << 4);
+}
+
+static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
+  return (x * m) >> 16;
+}
+
+static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
+                                               int w, int h, int stride,
+                                               int rg_byte_pos /* 0 or 1 */) {
+  while (h-- > 0) {
+    int i;
+    for (i = 0; i < w; ++i) {
+      const uint32_t rg = rgba4444[2 * i + rg_byte_pos];
+      const uint32_t ba = rgba4444[2 * i + (rg_byte_pos ^ 1)];
+      const uint8_t a = ba & 0x0f;
+      const uint32_t mult = MULTIPLIER(a);
+      const uint8_t r = multiply(dither_hi(rg), mult);
+      const uint8_t g = multiply(dither_lo(rg), mult);
+      const uint8_t b = multiply(dither_hi(ba), mult);
+      rgba4444[2 * i + rg_byte_pos] = (r & 0xf0) | ((g >> 4) & 0x0f);
+      rgba4444[2 * i + (rg_byte_pos ^ 1)] = (b & 0xf0) | a;
+    }
+    rgba4444 += stride;
+  }
+}
+#undef MULTIPLIER
+
+static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
+                                   int w, int h, int stride) {
+#ifdef WEBP_SWAP_16BIT_CSP
+  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
+#else
+  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
+#endif
+}
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  uint8_t alpha_mask = 0xff;
+  int i, j;
+
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const uint8_t alpha_value = argb[4 * i];
+      alpha[i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  return (alpha_mask == 0xff);
+}
+
+void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
+void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
+int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
+
+//------------------------------------------------------------------------------
+// Init function
+
+extern void WebPInitAlphaProcessingSSE2(void);
+
+void WebPInitAlphaProcessing(void) {
+  WebPMultARGBRow = MultARGBRow;
+  WebPMultRow = MultRow;
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+  WebPExtractAlpha = ExtractAlpha;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitAlphaProcessingSSE2();
+    }
+#endif
+  }
+}
--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@ -0,0 +1,77 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+//------------------------------------------------------------------------------
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  // alpha_and stores an 'and' operation of all the alpha[] values. The final
+  // value is not 0xff if any of the alpha[] is not equal to 0xff.
+  uint32_t alpha_and = 0xff;
+  int i, j;
+  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+  __m128i all_alphas = all_0xff;
+
+  // We must be able to access 3 extra bytes after the last written byte
+  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
+  // last byte of the quadruplet.
+  const int limit = (width - 1) & ~7;
+
+  for (j = 0; j < height; ++j) {
+    const __m128i* src = (const __m128i*)argb;
+    for (i = 0; i < limit; i += 8) {
+      // load 32 argb bytes
+      const __m128i a0 = _mm_loadu_si128(src + 0);
+      const __m128i a1 = _mm_loadu_si128(src + 1);
+      const __m128i b0 = _mm_and_si128(a0, a_mask);
+      const __m128i b1 = _mm_and_si128(a1, a_mask);
+      const __m128i c0 = _mm_packs_epi32(b0, b1);
+      const __m128i d0 = _mm_packus_epi16(c0, c0);
+      // store
+      _mm_storel_epi64((__m128i*)&alpha[i], d0);
+      // accumulate eight alpha 'and' in parallel
+      all_alphas = _mm_and_si128(all_alphas, d0);
+      src += 2;
+    }
+    for (; i < width; ++i) {
+      const uint32_t alpha_value = argb[4 * i];
+      alpha[i] = alpha_value;
+      alpha_and &= alpha_value;
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  // Combine the eight alpha 'and' into a 8-bit mask.
+  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+  return (alpha_and == 0xff);
+}
+
+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Init function
+
+extern void WebPInitAlphaProcessingSSE2(void);
+
+void WebPInitAlphaProcessingSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  WebPExtractAlpha = ExtractAlpha;
+#endif
+}
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -0,0 +1,130 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// CPU detection
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "./dsp.h"
+
+#if defined(__ANDROID__)
+#include <cpu-features.h>
+#endif
+
+//------------------------------------------------------------------------------
+// SSE2 detection.
+//
+
+// apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "mov %%ebx, %%edi\n"
+    "cpuid\n"
+    "xchg %%edi, %%ebx\n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "cpuid\n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
+#define GetCPUInfo(info, type) __cpuidex(info, type, 0)  // set ecx=0
+#elif defined(WEBP_MSC_SSE2)
+#define GetCPUInfo __cpuid
+#endif
+
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static WEBP_INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile (
+    ".byte 0x0f, 0x01, 0xd0\n"
+    : "=a"(eax), "=d"(edx) : "c" (ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static WEBP_INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx  // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+static int x86CPUInfo(CPUFeature feature) {
+  int cpu_info[4];
+  GetCPUInfo(cpu_info, 1);
+  if (feature == kSSE2) {
+    return 0 != (cpu_info[3] & 0x04000000);
+  }
+  if (feature == kSSE3) {
+    return 0 != (cpu_info[2] & 0x00000001);
+  }
+  if (feature == kAVX) {
+    // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+    if ((cpu_info[2] & 0x18000000) == 0x18000000) {
+      // XMM state and YMM state enabled by the OS.
+      return (xgetbv() & 0x6) == 0x6;
+    }
+  }
+  if (feature == kAVX2) {
+    if (x86CPUInfo(kAVX)) {
+      GetCPUInfo(cpu_info, 7);
+      return ((cpu_info[1] & 0x00000020) == 0x00000020);
+    }
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
+#elif defined(WEBP_ANDROID_NEON)  // NB: needs to be before generic NEON test.
+static int AndroidCPUInfo(CPUFeature feature) {
+  const AndroidCpuFamily cpu_family = android_getCpuFamily();
+  const uint64_t cpu_features = android_getCpuFeatures();
+  if (feature == kNEON) {
+    return (cpu_family == ANDROID_CPU_FAMILY_ARM &&
+            0 != (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON));
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
+#elif defined(WEBP_USE_NEON)
+// define a dummy function to enable turning off NEON at runtime by setting
+// VP8DecGetCPUInfo = NULL
+static int armCPUInfo(CPUFeature feature) {
+  (void)feature;
+  return 1;
+}
+VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#elif defined(WEBP_USE_MIPS32)
+static int mipsCPUInfo(CPUFeature feature) {
+  (void)feature;
+  return 1;
+}
+VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
+#else
+VP8CPUInfo VP8GetCPUInfo = NULL;
+#endif
+
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -1,72 +1,44 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// speed-critical functions.
+// Speed-critical decoding functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "vp8i.h"
+#include "./dsp.h"
+#include "../dec/vp8i.h"

-#if defined(__SSE2__)
-#include <emmintrin.h>
-#endif
+//------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//-----------------------------------------------------------------------------
-// run-time tables (~4k)
-
-static uint8_t abs0[255 + 255 + 1];     // abs(i)
-static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
-static int8_t sclip1[1020 + 1020 + 1];  // clips [-1020, 1020] to [-128, 127]
-static int8_t sclip2[112 + 112 + 1];    // clips [-112, 112] to [-16, 15]
-static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
-
-// We declare this variable 'volatile' to prevent instruction reordering
-// and make sure it's set to true _last_ (so as to be thread-safe)
-static volatile int tables_ok = 0;
-
-void VP8DspInitTables(void) {
-  if (!tables_ok) {
-    int i;
-    for (i = -255; i <= 255; ++i) {
-      abs0[255 + i] = (i < 0) ? -i : i;
-      abs1[255 + i] = abs0[255 + i] >> 1;
-    }
-    for (i = -1020; i <= 1020; ++i) {
-      sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
-    }
-    for (i = -112; i <= 112; ++i) {
-      sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
-    }
-    for (i = -255; i <= 255 + 255; ++i) {
-      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
-    }
-    tables_ok = 1;
-  }
-}
-
-static inline uint8_t clip_8b(int v) {
+static WEBP_INLINE uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

 #define STORE(x, y, v) \
  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))

+#define STORE2(y, dc, d, c) do {    \
+  const int DC = (dc);              \
+  STORE(0, y, DC + (d));            \
+  STORE(1, y, DC + (c));            \
+  STORE(2, y, DC - (c));            \
+  STORE(3, y, DC - (d));            \
+} while (0)
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)

-static void Transform(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* in, uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
@ -104,13 +76,32 @@ static void Transform(const int16_t* in, uint8_t* dst) {
    dst += BPS;
  }
 }
+
+// Simplified transform when only in[0], in[1] and in[4] are non-zero
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  const int a = in[0] + 4;
+  const int c4 = MUL(in[4], kC2);
+  const int d4 = MUL(in[4], kC1);
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  STORE2(0, a + d4, d1, c1);
+  STORE2(1, a + c4, d1, c1);
+  STORE2(2, a - c4, d1, c1);
+  STORE2(3, a - d4, d1, c1);
+}
 #undef MUL
+#undef STORE2
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);
+  }
+}

 static void TransformUV(const int16_t* in, uint8_t* dst) {
-  Transform(in + 0 * 16, dst);
-  Transform(in + 1 * 16, dst + 4);
-  Transform(in + 2 * 16, dst + 4 * BPS);
-  Transform(in + 3 * 16, dst + 4 * BPS + 4);
+  VP8Transform(in + 0 * 16, dst, 1);
+  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }

 static void TransformDC(const int16_t *in, uint8_t* dst) {
@ -124,21 +115,15 @@ static void TransformDC(const int16_t *in, uint8_t* dst) {
 }

 static void TransformDCUV(const int16_t* in, uint8_t* dst) {
-  if (in[0 * 16]) TransformDC(in + 0 * 16, dst);
-  if (in[1 * 16]) TransformDC(in + 1 * 16, dst + 4);
-  if (in[2 * 16]) TransformDC(in + 2 * 16, dst + 4 * BPS);
-  if (in[3 * 16]) TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
+  if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
+  if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
+  if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
+  if (in[3 * 16]) VP8TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
 }

 #undef STORE

-// default C implementations:
-VP8Idct VP8Transform = Transform;
-VP8Idct VP8TransformUV = TransformUV;
-VP8Idct VP8TransformDC = TransformDC;
-VP8Idct VP8TransformDCUV = TransformDCUV;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 14.3

 static void TransformWHT(const int16_t* in, int16_t* out) {
@ -168,16 +153,16 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
  }
 }

-void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
+void (*VP8TransformWHT)(const int16_t* in, int16_t* out);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Intra predictions

-#define OUT(x, y) dst[(x) + (y) * BPS]
+#define DST(x, y) dst[(x) + (y) * BPS]

-static inline void TrueMotion(uint8_t *dst, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
  const uint8_t* top = dst - BPS;
-  const uint8_t* const clip0 = clip1 + 255 - top[-1];
+  const uint8_t* const clip0 = VP8kclip1 - top[-1];
  int y;
  for (y = 0; y < size; ++y) {
    const uint8_t* const clip = clip0 + dst[-1];
@ -192,7 +177,7 @@ static void TM4(uint8_t *dst)   { TrueMotion(dst, 4); }
 static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
 static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // 16x16

 static void VE16(uint8_t *dst) {     // vertical
@ -210,7 +195,7 @@ static void HE16(uint8_t *dst) {     // horizontal
  }
 }

-static inline void Put16(int v, uint8_t* dst) {
+static WEBP_INLINE void Put16(int v, uint8_t* dst) {
  int j;
  for (j = 0; j < 16; ++j) {
    memset(dst + j * BPS, v, 16);
@ -248,7 +233,7 @@ static void DC16NoTopLeft(uint8_t *dst) {  // DC with no top and left samples
  Put16(0x80, dst);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // 4x4

 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
@ -298,13 +283,13 @@ static void RD4(uint8_t *dst) {   // Down-right
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
  const int D = dst[3 - BPS];
-  OUT(0, 3)                                     = AVG3(J, K, L);
-  OUT(0, 2) = OUT(1, 3)                         = AVG3(I, J, K);
-  OUT(0, 1) = OUT(1, 2) = OUT(2, 3)             = AVG3(X, I, J);
-  OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I);
-  OUT(1, 0) = OUT(2, 1) = OUT(3, 2)             = AVG3(B, A, X);
-  OUT(2, 0) = OUT(3, 1)                         = AVG3(C, B, A);
-  OUT(3, 0)                                     = AVG3(D, C, B);
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
+  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
+  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
+  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
+  DST(3, 0)                                     = AVG3(D, C, B);
 }

 static void LD4(uint8_t *dst) {   // Down-Left
@ -316,13 +301,13 @@ static void LD4(uint8_t *dst) {   // Down-Left
  const int F = dst[5 - BPS];
  const int G = dst[6 - BPS];
  const int H = dst[7 - BPS];
-  OUT(0, 0)                                     = AVG3(A, B, C);
-  OUT(1, 0) = OUT(0, 1)                         = AVG3(B, C, D);
-  OUT(2, 0) = OUT(1, 1) = OUT(0, 2)             = AVG3(C, D, E);
-  OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F);
-  OUT(3, 1) = OUT(2, 2) = OUT(1, 3)             = AVG3(E, F, G);
-  OUT(3, 2) = OUT(2, 3)                         = AVG3(F, G, H);
-  OUT(3, 3)                                     = AVG3(G, H, H);
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
+  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
+  DST(3, 3)                                     = AVG3(G, H, H);
 }

 static void VR4(uint8_t *dst) {   // Vertical-Right
@ -334,17 +319,17 @@ static void VR4(uint8_t *dst) {   // Vertical-Right
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
  const int D = dst[3 - BPS];
-  OUT(0, 0) = OUT(1, 2) = AVG2(X, A);
-  OUT(1, 0) = OUT(2, 2) = AVG2(A, B);
-  OUT(2, 0) = OUT(3, 2) = AVG2(B, C);
-  OUT(3, 0)             = AVG2(C, D);
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);

-  OUT(0, 3) =             AVG3(K, J, I);
-  OUT(0, 2) =             AVG3(J, I, X);
-  OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B);
-  OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C);
-  OUT(3, 1) =             AVG3(B, C, D);
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
 }

 static void VL4(uint8_t *dst) {   // Vertical-Left
@ -356,17 +341,17 @@ static void VL4(uint8_t *dst) {   // Vertical-Left
  const int F = dst[5 - BPS];
  const int G = dst[6 - BPS];
  const int H = dst[7 - BPS];
-  OUT(0, 0) =             AVG2(A, B);
-  OUT(1, 0) = OUT(0, 2) = AVG2(B, C);
-  OUT(2, 0) = OUT(1, 2) = AVG2(C, D);
-  OUT(3, 0) = OUT(2, 2) = AVG2(D, E);
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);

-  OUT(0, 1) =             AVG3(A, B, C);
-  OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D);
-  OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E);
-  OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F);
-              OUT(3, 2) = AVG3(E, F, G);
-              OUT(3, 3) = AVG3(F, G, H);
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
 }

 static void HU4(uint8_t *dst) {   // Horizontal-Up
@ -374,14 +359,14 @@ static void HU4(uint8_t *dst) {   // Horizontal-Up
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
  const int L = dst[-1 + 3 * BPS];
-  OUT(0, 0) =             AVG2(I, J);
-  OUT(2, 0) = OUT(0, 1) = AVG2(J, K);
-  OUT(2, 1) = OUT(0, 2) = AVG2(K, L);
-  OUT(1, 0) =             AVG3(I, J, K);
-  OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L);
-  OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L);
-  OUT(3, 2) = OUT(2, 2) =
-    OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L;
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

 static void HD4(uint8_t *dst) {  // Horizontal-Down
@ -394,23 +379,24 @@ static void HD4(uint8_t *dst) {  // Horizontal-Down
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];

-  OUT(0, 0) = OUT(2, 1) = AVG2(I, X);
-  OUT(0, 1) = OUT(2, 2) = AVG2(J, I);
-  OUT(0, 2) = OUT(2, 3) = AVG2(K, J);
-  OUT(0, 3)             = AVG2(L, K);
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);

-  OUT(3, 0)             = AVG3(A, B, C);
-  OUT(2, 0)             = AVG3(X, A, B);
-  OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X);
-  OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I);
-  OUT(1, 3)             = AVG3(L, K, J);
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
 }

+#undef DST
 #undef AVG3
 #undef AVG2

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Chroma

 static void VE8uv(uint8_t *dst) {    // vertical
@ -429,10 +415,10 @@ static void HE8uv(uint8_t *dst) {    // horizontal
 }

 // helper for chroma-DC predictions
-static inline void Put8x8uv(uint64_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  int j;
  for (j = 0; j < 8; ++j) {
-    *(uint64_t*)(dst + j * BPS) = v;
+    memset(dst + j * BPS, value, 8);
  }
 }

@ -442,7 +428,7 @@ static void DC8uv(uint8_t *dst) {     // DC
  for (i = 0; i < 8; ++i) {
    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 4, dst);
 }

 static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
@ -451,7 +437,7 @@ static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
  for (i = 0; i < 8; ++i) {
    dc0 += dst[i - BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }

 static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
@ -460,99 +446,102 @@ static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
  for (i = 0; i < 8; ++i) {
    dc0 += dst[-1 + i * BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }

 static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
-  Put8x8uv(0x8080808080808080ULL, dst);
+  Put8x8uv(0x80, dst);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // default C implementations

-VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
+const VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
  DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
 };

-VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
+const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
  DC16, TM16, VE16, HE16,
  DC16NoTop, DC16NoLeft, DC16NoTopLeft
 };

-VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
+const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
  DC8uv, TM8uv, VE8uv, HE8uv,
  DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Edge filtering functions

 // 4 pixels in, 2 pixels out
-static inline void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
-  const int a1 = sclip2[112 + ((a + 4) >> 3)];
-  const int a2 = sclip2[112 + ((a + 3) >> 3)];
-  p[-step] = clip1[255 + p0 + a2];
-  p[    0] = clip1[255 + q0 - a1];
+  const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
+  const int a1 = VP8ksclip2[(a + 4) >> 3];            // in [-16,15]
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
+  p[-step] = VP8kclip1[p0 + a2];
+  p[    0] = VP8kclip1[q0 - a1];
 }

 // 4 pixels in, 4 pixels out
-static inline void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0);
-  const int a1 = sclip2[112 + ((a + 4) >> 3)];
-  const int a2 = sclip2[112 + ((a + 3) >> 3)];
+  const int a1 = VP8ksclip2[(a + 4) >> 3];
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
  const int a3 = (a1 + 1) >> 1;
-  p[-2*step] = clip1[255 + p1 + a3];
-  p[-  step] = clip1[255 + p0 + a2];
-  p[      0] = clip1[255 + q0 - a1];
-  p[   step] = clip1[255 + q1 - a3];
+  p[-2*step] = VP8kclip1[p1 + a3];
+  p[-  step] = VP8kclip1[p0 + a2];
+  p[      0] = VP8kclip1[q0 - a1];
+  p[   step] = VP8kclip1[q1 - a3];
 }

 // 6 pixels in, 6 pixels out
-static inline void do_filter6(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
  const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2*step];
-  const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
+  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+  // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
-  p[-3*step] = clip1[255 + p2 + a3];
-  p[-2*step] = clip1[255 + p1 + a2];
-  p[-  step] = clip1[255 + p0 + a1];
-  p[      0] = clip1[255 + q0 - a1];
-  p[   step] = clip1[255 + q1 - a2];
-  p[ 2*step] = clip1[255 + q2 - a3];
+  p[-3*step] = VP8kclip1[p2 + a3];
+  p[-2*step] = VP8kclip1[p1 + a2];
+  p[-  step] = VP8kclip1[p0 + a1];
+  p[      0] = VP8kclip1[q0 - a1];
+  p[   step] = VP8kclip1[q1 - a2];
+  p[ 2*step] = VP8kclip1[q2 - a3];
 }

-static inline int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
+  return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
 }

-static inline int needs_filter(const uint8_t* p, int step, int thresh) {
-  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
 }

-static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
-  const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
-  const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
-  if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
-    return 0;
-  return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
-         abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
-         abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
+  const int p0 = p[-step], q0 = p[0];
+  const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  if ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) > t) return 0;
+  return VP8kabs0[p3 - p2] <= it && VP8kabs0[p2 - p1] <= it &&
+         VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
+         VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)

 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
+  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i, stride, thresh)) {
+    if (needs_filter(p + i, stride, thresh2)) {
      do_filter2(p + i, stride);
    }
  }
@ -560,8 +549,9 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {

 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  int i;
+  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i * stride, 1, thresh)) {
+    if (needs_filter(p + i * stride, 1, thresh2)) {
      do_filter2(p + i * stride, 1);
    }
  }
@ -583,13 +573,15 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)

-static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size,
-                                int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh, ithresh)) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
        do_filter2(p, hstride);
      } else {
@ -600,10 +592,12 @@ static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size,
  }
 }

-static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size,
-                                int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh, ithresh)) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
        do_filter2(p, hstride);
      } else {
@ -669,28 +663,69 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

-void (*VP8VFilter16)(uint8_t*, int, int, int, int) = VFilter16;
-void (*VP8HFilter16)(uint8_t*, int, int, int, int) = HFilter16;
-void (*VP8VFilter8)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8;
-void (*VP8HFilter8)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8;
-void (*VP8VFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
-void (*VP8HFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
-void (*VP8VFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8i;
-void (*VP8HFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i;
+VP8DecIdct2 VP8Transform;
+VP8DecIdct VP8TransformAC3;
+VP8DecIdct VP8TransformUV;
+VP8DecIdct VP8TransformDC;
+VP8DecIdct VP8TransformDCUV;

-void (*VP8SimpleVFilter16)(uint8_t*, int, int) = SimpleVFilter16;
-void (*VP8SimpleHFilter16)(uint8_t*, int, int) = SimpleHFilter16;
-void (*VP8SimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
-void (*VP8SimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;
+VP8LumaFilterFunc VP8VFilter16;
+VP8LumaFilterFunc VP8HFilter16;
+VP8ChromaFilterFunc VP8VFilter8;
+VP8ChromaFilterFunc VP8HFilter8;
+VP8LumaFilterFunc VP8VFilter16i;
+VP8LumaFilterFunc VP8HFilter16i;
+VP8ChromaFilterFunc VP8VFilter8i;
+VP8ChromaFilterFunc VP8HFilter8i;
+VP8SimpleFilterFunc VP8SimpleVFilter16;
+VP8SimpleFilterFunc VP8SimpleHFilter16;
+VP8SimpleFilterFunc VP8SimpleVFilter16i;
+VP8SimpleFilterFunc VP8SimpleHFilter16i;

-//-----------------------------------------------------------------------------
+extern void VP8DspInitSSE2(void);
+extern void VP8DspInitNEON(void);
+extern void VP8DspInitMIPS32(void);

 void VP8DspInit(void) {
-  // later we'll plug some SSE2 variant here
+  VP8InitClipTables();
+
+  VP8TransformWHT = TransformWHT;
+  VP8Transform = TransformTwo;
+  VP8TransformUV = TransformUV;
+  VP8TransformDC = TransformDC;
+  VP8TransformDCUV = TransformDCUV;
+  VP8TransformAC3 = TransformAC3;
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8DspInitSSE2();
+    }
+#elif defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8DspInitNEON();
+    }
+#elif defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8DspInitMIPS32();
+    }
+#endif
+  }
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dsp/dec_clip_tables.c
+++ b/src/dsp/dec_clip_tables.c
@ -0,0 +1,366 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Clipping tables for filtering
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#define USE_STATIC_TABLES     // undefine to have run-time table initialization
+
+#ifdef USE_STATIC_TABLES
+
+static const uint8_t abs0[255 + 255 + 1] = {
+  0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
+  0xf3, 0xf2, 0xf1, 0xf0, 0xef, 0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8,
+  0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0, 0xdf, 0xde, 0xdd, 0xdc,
+  0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3, 0xd2, 0xd1, 0xd0,
+  0xcf, 0xce, 0xcd, 0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4,
+  0xc3, 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+  0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac,
+  0xab, 0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0,
+  0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a, 0x99, 0x98, 0x97, 0x96, 0x95, 0x94,
+  0x93, 0x92, 0x91, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88,
+  0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80, 0x7f, 0x7e, 0x7d, 0x7c,
+  0x7b, 0x7a, 0x79, 0x78, 0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70,
+  0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68, 0x67, 0x66, 0x65, 0x64,
+  0x63, 0x62, 0x61, 0x60, 0x5f, 0x5e, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58,
+  0x57, 0x56, 0x55, 0x54, 0x53, 0x52, 0x51, 0x50, 0x4f, 0x4e, 0x4d, 0x4c,
+  0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45, 0x44, 0x43, 0x42, 0x41, 0x40,
+  0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34,
+  0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28,
+  0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c,
+  0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
+  0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
+  0x03, 0x02, 0x01, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+  0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
+  0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+  0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
+  0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+  0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
+  0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+  0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
+  0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+  0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+  0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+  0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
+  0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+  0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
+  0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
+  0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
+  0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
+  0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
+  0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
+  0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
+  0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+  0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static const int8_t sclip1[1020 + 1020 + 1] = {
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+  0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93,
+  0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+  0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab,
+  0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+  0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3,
+  0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
+  0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+  0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3,
+  0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
+  0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
+  0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
+  0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+  0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53,
+  0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+  0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
+  0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+  0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
+};
+
+static const int8_t sclip2[112 + 112 + 1] = {
+  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+  0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
+  0xfc, 0xfd, 0xfe, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
+};
+
+static const uint8_t clip1[255 + 511 + 1] = {
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+  0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
+  0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+  0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
+  0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+  0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
+  0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+  0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
+  0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+  0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+  0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+  0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
+  0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+  0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
+  0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
+  0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
+  0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
+  0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
+  0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
+  0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
+  0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+  0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+#else
+
+// uninitialized tables
+static uint8_t abs0[255 + 255 + 1];
+static int8_t sclip1[1020 + 1020 + 1];
+static int8_t sclip2[112 + 112 + 1];
+static uint8_t clip1[255 + 511 + 1];
+
+// We declare this variable 'volatile' to prevent instruction reordering
+// and make sure it's set to true _last_ (so as to be thread-safe)
+static volatile int tables_ok = 0;
+
+#endif
+
+const int8_t* const VP8ksclip1 = &sclip1[1020];
+const int8_t* const VP8ksclip2 = &sclip2[112];
+const uint8_t* const VP8kclip1 = &clip1[255];
+const uint8_t* const VP8kabs0 = &abs0[255];
+
+void VP8InitClipTables(void) {
+#if !defined(USE_STATIC_TABLES)
+  int i;
+  if (!tables_ok) {
+    for (i = -255; i <= 255; ++i) {
+      abs0[255 + i] = (i < 0) ? -i : i;
+    }
+    for (i = -1020; i <= 1020; ++i) {
+      sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
+    }
+    for (i = -112; i <= 112; ++i) {
+      sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
+    }
+    for (i = -255; i <= 255 + 255; ++i) {
+      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
+    }
+    tables_ok = 1;
+  }
+#endif    // USE_STATIC_TABLES
+}
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@ -0,0 +1,578 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of dsp functions
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+static WEBP_INLINE int abs_mips32(int x) {
+  const int sign = x >> 31;
+  return (x ^ sign) - sign;
+}
+
+// 4 pixels in, 2 pixels out
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
+  const int a1 = VP8ksclip2[(a + 4) >> 3];
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
+  p[-step] = VP8kclip1[p0 + a2];
+  p[    0] = VP8kclip1[q0 - a1];
+}
+
+// 4 pixels in, 4 pixels out
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0);
+  const int a1 = VP8ksclip2[(a + 4) >> 3];
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
+  const int a3 = (a1 + 1) >> 1;
+  p[-2 * step] = VP8kclip1[p1 + a3];
+  p[-    step] = VP8kclip1[p0 + a2];
+  p[        0] = VP8kclip1[q0 - a1];
+  p[     step] = VP8kclip1[q1 - a3];
+}
+
+// 6 pixels in, 6 pixels out
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
+  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
+  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
+  p[-3 * step] = VP8kclip1[p2 + a3];
+  p[-2 * step] = VP8kclip1[p1 + a2];
+  p[-    step] = VP8kclip1[p0 + a1];
+  p[        0] = VP8kclip1[q0 - a1];
+  p[     step] = VP8kclip1[q1 - a2];
+  p[ 2 * step] = VP8kclip1[q2 - a3];
+}
+
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
+}
+
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) <= thresh);
+}
+
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step];
+  const int p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  if ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) > t) {
+    return 0;
+  }
+  return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
+         abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
+         abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
+}
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter6(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter4(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i, stride, thresh)) {
+      do_filter2(p + i, stride);
+    }
+  }
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i * stride, 1, thresh)) {
+      do_filter2(p + i * stride, 1);
+    }
+  }
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16(p, stride, thresh);
+  }
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14;
+  int temp15, temp16, temp17, temp18;
+  int16_t* p_in = (int16_t*)in;
+
+  // loops unrolled and merged to avoid usage of tmp buffer
+  // and to reduce number of stalls. MUL macro is written
+  // in assembler and inlined
+  __asm__ volatile(
+    "lh       %[temp0],  0(%[in])                      \n\t"
+    "lh       %[temp8],  16(%[in])                     \n\t"
+    "lh       %[temp4],  8(%[in])                      \n\t"
+    "lh       %[temp12], 24(%[in])                     \n\t"
+    "addu     %[temp16], %[temp0],  %[temp8]           \n\t"
+    "subu     %[temp0],  %[temp0],  %[temp8]           \n\t"
+    "mul      %[temp8],  %[temp4],  %[kC2]             \n\t"
+    "mul      %[temp17], %[temp12], %[kC1]             \n\t"
+    "mul      %[temp4],  %[temp4],  %[kC1]             \n\t"
+    "mul      %[temp12], %[temp12], %[kC2]             \n\t"
+    "lh       %[temp1],  2(%[in])                      \n\t"
+    "lh       %[temp5],  10(%[in])                     \n\t"
+    "lh       %[temp9],  18(%[in])                     \n\t"
+    "lh       %[temp13], 26(%[in])                     \n\t"
+    "sra      %[temp8],  %[temp8],  16                 \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "sra      %[temp4],  %[temp4],  16                 \n\t"
+    "sra      %[temp12], %[temp12], 16                 \n\t"
+    "lh       %[temp2],  4(%[in])                      \n\t"
+    "lh       %[temp6],  12(%[in])                     \n\t"
+    "lh       %[temp10], 20(%[in])                     \n\t"
+    "lh       %[temp14], 28(%[in])                     \n\t"
+    "subu     %[temp17], %[temp8],  %[temp17]          \n\t"
+    "addu     %[temp4],  %[temp4],  %[temp12]          \n\t"
+    "addu     %[temp8],  %[temp16], %[temp4]           \n\t"
+    "subu     %[temp4],  %[temp16], %[temp4]           \n\t"
+    "addu     %[temp16], %[temp1],  %[temp9]           \n\t"
+    "subu     %[temp1],  %[temp1],  %[temp9]           \n\t"
+    "lh       %[temp3],  6(%[in])                      \n\t"
+    "lh       %[temp7],  14(%[in])                     \n\t"
+    "lh       %[temp11], 22(%[in])                     \n\t"
+    "lh       %[temp15], 30(%[in])                     \n\t"
+    "addu     %[temp12], %[temp0],  %[temp17]          \n\t"
+    "subu     %[temp0],  %[temp0],  %[temp17]          \n\t"
+    "mul      %[temp9],  %[temp5],  %[kC2]             \n\t"
+    "mul      %[temp17], %[temp13], %[kC1]             \n\t"
+    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
+    "mul      %[temp13], %[temp13], %[kC2]             \n\t"
+    "sra      %[temp9],  %[temp9],  16                 \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "subu     %[temp17], %[temp9],  %[temp17]          \n\t"
+    "sra      %[temp5],  %[temp5],  16                 \n\t"
+    "sra      %[temp13], %[temp13], 16                 \n\t"
+    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
+    "addu     %[temp13], %[temp1],  %[temp17]          \n\t"
+    "subu     %[temp1],  %[temp1],  %[temp17]          \n\t"
+    "mul      %[temp17], %[temp14], %[kC1]             \n\t"
+    "mul      %[temp14], %[temp14], %[kC2]             \n\t"
+    "addu     %[temp9],  %[temp16], %[temp5]           \n\t"
+    "subu     %[temp5],  %[temp16], %[temp5]           \n\t"
+    "addu     %[temp16], %[temp2],  %[temp10]          \n\t"
+    "subu     %[temp2],  %[temp2],  %[temp10]          \n\t"
+    "mul      %[temp10], %[temp6],  %[kC2]             \n\t"
+    "mul      %[temp6],  %[temp6],  %[kC1]             \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "sra      %[temp14], %[temp14], 16                 \n\t"
+    "sra      %[temp10], %[temp10], 16                 \n\t"
+    "sra      %[temp6],  %[temp6],  16                 \n\t"
+    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
+    "addu     %[temp6],  %[temp6],  %[temp14]          \n\t"
+    "addu     %[temp10], %[temp16], %[temp6]           \n\t"
+    "subu     %[temp6],  %[temp16], %[temp6]           \n\t"
+    "addu     %[temp14], %[temp2],  %[temp17]          \n\t"
+    "subu     %[temp2],  %[temp2],  %[temp17]          \n\t"
+    "mul      %[temp17], %[temp15], %[kC1]             \n\t"
+    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
+    "addu     %[temp16], %[temp3],  %[temp11]          \n\t"
+    "subu     %[temp3],  %[temp3],  %[temp11]          \n\t"
+    "mul      %[temp11], %[temp7],  %[kC2]             \n\t"
+    "mul      %[temp7],  %[temp7],  %[kC1]             \n\t"
+    "addiu    %[temp8],  %[temp8],  4                  \n\t"
+    "addiu    %[temp12], %[temp12], 4                  \n\t"
+    "addiu    %[temp0],  %[temp0],  4                  \n\t"
+    "addiu    %[temp4],  %[temp4],  4                  \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "sra      %[temp15], %[temp15], 16                 \n\t"
+    "sra      %[temp11], %[temp11], 16                 \n\t"
+    "sra      %[temp7],  %[temp7],  16                 \n\t"
+    "subu     %[temp17], %[temp11], %[temp17]          \n\t"
+    "addu     %[temp7],  %[temp7],  %[temp15]          \n\t"
+    "addu     %[temp15], %[temp3],  %[temp17]          \n\t"
+    "subu     %[temp3],  %[temp3],  %[temp17]          \n\t"
+    "addu     %[temp11], %[temp16], %[temp7]           \n\t"
+    "subu     %[temp7],  %[temp16], %[temp7]           \n\t"
+    "addu     %[temp16], %[temp8],  %[temp10]          \n\t"
+    "subu     %[temp8],  %[temp8],  %[temp10]          \n\t"
+    "mul      %[temp10], %[temp9],  %[kC2]             \n\t"
+    "mul      %[temp17], %[temp11], %[kC1]             \n\t"
+    "mul      %[temp9],  %[temp9],  %[kC1]             \n\t"
+    "mul      %[temp11], %[temp11], %[kC2]             \n\t"
+    "sra      %[temp10], %[temp10], 16                 \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "sra      %[temp9],  %[temp9],  16                 \n\t"
+    "sra      %[temp11], %[temp11], 16                 \n\t"
+    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
+    "addu     %[temp11], %[temp9],  %[temp11]          \n\t"
+    "addu     %[temp10], %[temp12], %[temp14]          \n\t"
+    "subu     %[temp12], %[temp12], %[temp14]          \n\t"
+    "mul      %[temp14], %[temp13], %[kC2]             \n\t"
+    "mul      %[temp9],  %[temp15], %[kC1]             \n\t"
+    "mul      %[temp13], %[temp13], %[kC1]             \n\t"
+    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
+    "sra      %[temp14], %[temp14], 16                 \n\t"
+    "sra      %[temp9],  %[temp9],  16                 \n\t"
+    "sra      %[temp13], %[temp13], 16                 \n\t"
+    "sra      %[temp15], %[temp15], 16                 \n\t"
+    "subu     %[temp9],  %[temp14], %[temp9]           \n\t"
+    "addu     %[temp15], %[temp13], %[temp15]          \n\t"
+    "addu     %[temp14], %[temp0],  %[temp2]           \n\t"
+    "subu     %[temp0],  %[temp0],  %[temp2]           \n\t"
+    "mul      %[temp2],  %[temp1],  %[kC2]             \n\t"
+    "mul      %[temp13], %[temp3],  %[kC1]             \n\t"
+    "mul      %[temp1],  %[temp1],  %[kC1]             \n\t"
+    "mul      %[temp3],  %[temp3],  %[kC2]             \n\t"
+    "sra      %[temp2],  %[temp2],  16                 \n\t"
+    "sra      %[temp13], %[temp13], 16                 \n\t"
+    "sra      %[temp1],  %[temp1],  16                 \n\t"
+    "sra      %[temp3],  %[temp3],  16                 \n\t"
+    "subu     %[temp13], %[temp2],  %[temp13]          \n\t"
+    "addu     %[temp3],  %[temp1],  %[temp3]           \n\t"
+    "addu     %[temp2],  %[temp4],  %[temp6]           \n\t"
+    "subu     %[temp4],  %[temp4],  %[temp6]           \n\t"
+    "mul      %[temp6],  %[temp5],  %[kC2]             \n\t"
+    "mul      %[temp1],  %[temp7],  %[kC1]             \n\t"
+    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
+    "mul      %[temp7],  %[temp7],  %[kC2]             \n\t"
+    "sra      %[temp6],  %[temp6],  16                 \n\t"
+    "sra      %[temp1],  %[temp1],  16                 \n\t"
+    "sra      %[temp5],  %[temp5],  16                 \n\t"
+    "sra      %[temp7],  %[temp7],  16                 \n\t"
+    "subu     %[temp1],  %[temp6],  %[temp1]           \n\t"
+    "addu     %[temp7],  %[temp5],  %[temp7]           \n\t"
+    "addu     %[temp5],  %[temp16], %[temp11]          \n\t"
+    "subu     %[temp16], %[temp16], %[temp11]          \n\t"
+    "addu     %[temp11], %[temp8],  %[temp17]          \n\t"
+    "subu     %[temp8],  %[temp8],  %[temp17]          \n\t"
+    "sra      %[temp5],  %[temp5],  3                  \n\t"
+    "sra      %[temp16], %[temp16], 3                  \n\t"
+    "sra      %[temp11], %[temp11], 3                  \n\t"
+    "sra      %[temp8],  %[temp8],  3                  \n\t"
+    "addu     %[temp17], %[temp10], %[temp15]          \n\t"
+    "subu     %[temp10], %[temp10], %[temp15]          \n\t"
+    "addu     %[temp15], %[temp12], %[temp9]           \n\t"
+    "subu     %[temp12], %[temp12], %[temp9]           \n\t"
+    "sra      %[temp17], %[temp17], 3                  \n\t"
+    "sra      %[temp10], %[temp10], 3                  \n\t"
+    "sra      %[temp15], %[temp15], 3                  \n\t"
+    "sra      %[temp12], %[temp12], 3                  \n\t"
+    "addu     %[temp9],  %[temp14], %[temp3]           \n\t"
+    "subu     %[temp14], %[temp14], %[temp3]           \n\t"
+    "addu     %[temp3],  %[temp0],  %[temp13]          \n\t"
+    "subu     %[temp0],  %[temp0],  %[temp13]          \n\t"
+    "sra      %[temp9],  %[temp9],  3                  \n\t"
+    "sra      %[temp14], %[temp14], 3                  \n\t"
+    "sra      %[temp3],  %[temp3],  3                  \n\t"
+    "sra      %[temp0],  %[temp0],  3                  \n\t"
+    "addu     %[temp13], %[temp2],  %[temp7]           \n\t"
+    "subu     %[temp2],  %[temp2],  %[temp7]           \n\t"
+    "addu     %[temp7],  %[temp4],  %[temp1]           \n\t"
+    "subu     %[temp4],  %[temp4],  %[temp1]           \n\t"
+    "sra      %[temp13], %[temp13], 3                  \n\t"
+    "sra      %[temp2],  %[temp2],  3                  \n\t"
+    "sra      %[temp7],  %[temp7],  3                  \n\t"
+    "sra      %[temp4],  %[temp4],  3                  \n\t"
+    "addiu    %[temp6],  $zero,     255                \n\t"
+    "lbu      %[temp1],  0(%[dst])                     \n\t"
+    "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
+    "sra      %[temp5],  %[temp1],  8                  \n\t"
+    "sra      %[temp18], %[temp1],  31                 \n\t"
+    "beqz     %[temp5],  1f                            \n\t"
+    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
+    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
+  "1:                                                  \n\t"
+    "lbu      %[temp18], 1(%[dst])                     \n\t"
+    "sb       %[temp1],  0(%[dst])                     \n\t"
+    "addu     %[temp18], %[temp18], %[temp11]          \n\t"
+    "sra      %[temp11], %[temp18], 8                  \n\t"
+    "sra      %[temp1],  %[temp18], 31                 \n\t"
+    "beqz     %[temp11], 2f                            \n\t"
+    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
+    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
+  "2:                                                  \n\t"
+    "lbu      %[temp1],  2(%[dst])                     \n\t"
+    "sb       %[temp18], 1(%[dst])                     \n\t"
+    "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
+    "sra      %[temp8],  %[temp1],  8                  \n\t"
+    "sra      %[temp18], %[temp1],  31                 \n\t"
+    "beqz     %[temp8],  3f                            \n\t"
+    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
+    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
+  "3:                                                  \n\t"
+    "lbu      %[temp18], 3(%[dst])                     \n\t"
+    "sb       %[temp1],  2(%[dst])                     \n\t"
+    "addu     %[temp18], %[temp18], %[temp16]          \n\t"
+    "sra      %[temp16], %[temp18], 8                  \n\t"
+    "sra      %[temp1],  %[temp18], 31                 \n\t"
+    "beqz     %[temp16], 4f                            \n\t"
+    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
+    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
+  "4:                                                  \n\t"
+    "sb       %[temp18], 3(%[dst])                     \n\t"
+    "lbu      %[temp5],  32(%[dst])                    \n\t"
+    "lbu      %[temp8],  33(%[dst])                    \n\t"
+    "lbu      %[temp11], 34(%[dst])                    \n\t"
+    "lbu      %[temp16], 35(%[dst])                    \n\t"
+    "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
+    "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
+    "addu     %[temp11], %[temp11], %[temp12]          \n\t"
+    "addu     %[temp16], %[temp16], %[temp10]          \n\t"
+    "sra      %[temp18], %[temp5],  8                  \n\t"
+    "sra      %[temp1],  %[temp5],  31                 \n\t"
+    "beqz     %[temp18], 5f                            \n\t"
+    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
+    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
+  "5:                                                  \n\t"
+    "sra      %[temp18], %[temp8],  8                  \n\t"
+    "sra      %[temp1],  %[temp8],  31                 \n\t"
+    "beqz     %[temp18], 6f                            \n\t"
+    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
+    "movz     %[temp8],  %[temp6],  %[temp1]           \n\t"
+  "6:                                                  \n\t"
+    "sra      %[temp18], %[temp11], 8                  \n\t"
+    "sra      %[temp1],  %[temp11], 31                 \n\t"
+    "sra      %[temp17], %[temp16], 8                  \n\t"
+    "sra      %[temp15], %[temp16], 31                 \n\t"
+    "beqz     %[temp18], 7f                            \n\t"
+    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
+    "movz     %[temp11], %[temp6],  %[temp1]           \n\t"
+  "7:                                                  \n\t"
+    "beqz     %[temp17], 8f                            \n\t"
+    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
+    "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
+  "8:                                                  \n\t"
+    "sb       %[temp5],  32(%[dst])                    \n\t"
+    "sb       %[temp8],  33(%[dst])                    \n\t"
+    "sb       %[temp11], 34(%[dst])                    \n\t"
+    "sb       %[temp16], 35(%[dst])                    \n\t"
+    "lbu      %[temp5],  64(%[dst])                    \n\t"
+    "lbu      %[temp8],  65(%[dst])                    \n\t"
+    "lbu      %[temp11], 66(%[dst])                    \n\t"
+    "lbu      %[temp16], 67(%[dst])                    \n\t"
+    "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
+    "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
+    "addu     %[temp11], %[temp11], %[temp0]           \n\t"
+    "addu     %[temp16], %[temp16], %[temp14]          \n\t"
+    "sra      %[temp18], %[temp5],  8                  \n\t"
+    "sra      %[temp1],  %[temp5],  31                 \n\t"
+    "sra      %[temp17], %[temp8],  8                  \n\t"
+    "sra      %[temp15], %[temp8],  31                 \n\t"
+    "sra      %[temp12], %[temp11], 8                  \n\t"
+    "sra      %[temp10], %[temp11], 31                 \n\t"
+    "sra      %[temp9],  %[temp16], 8                  \n\t"
+    "sra      %[temp3],  %[temp16], 31                 \n\t"
+    "beqz     %[temp18], 9f                            \n\t"
+    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
+    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
+  "9:                                                  \n\t"
+    "beqz     %[temp17], 10f                           \n\t"
+    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
+    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
+  "10:                                                 \n\t"
+    "beqz     %[temp12], 11f                           \n\t"
+    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
+    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
+  "11:                                                 \n\t"
+    "beqz     %[temp9],  12f                           \n\t"
+    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
+    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
+  "12:                                                 \n\t"
+    "sb       %[temp5],  64(%[dst])                    \n\t"
+    "sb       %[temp8],  65(%[dst])                    \n\t"
+    "sb       %[temp11], 66(%[dst])                    \n\t"
+    "sb       %[temp16], 67(%[dst])                    \n\t"
+    "lbu      %[temp5],  96(%[dst])                    \n\t"
+    "lbu      %[temp8],  97(%[dst])                    \n\t"
+    "lbu      %[temp11], 98(%[dst])                    \n\t"
+    "lbu      %[temp16], 99(%[dst])                    \n\t"
+    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
+    "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
+    "addu     %[temp11], %[temp11], %[temp4]           \n\t"
+    "addu     %[temp16], %[temp16], %[temp2]           \n\t"
+    "sra      %[temp18], %[temp5],  8                  \n\t"
+    "sra      %[temp1],  %[temp5],  31                 \n\t"
+    "sra      %[temp17], %[temp8],  8                  \n\t"
+    "sra      %[temp15], %[temp8],  31                 \n\t"
+    "sra      %[temp12], %[temp11], 8                  \n\t"
+    "sra      %[temp10], %[temp11], 31                 \n\t"
+    "sra      %[temp9],  %[temp16], 8                  \n\t"
+    "sra      %[temp3],  %[temp16], 31                 \n\t"
+    "beqz     %[temp18], 13f                           \n\t"
+    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
+    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
+  "13:                                                 \n\t"
+    "beqz     %[temp17], 14f                           \n\t"
+    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
+    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
+  "14:                                                 \n\t"
+    "beqz     %[temp12], 15f                           \n\t"
+    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
+    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
+  "15:                                                 \n\t"
+    "beqz     %[temp9],  16f                           \n\t"
+    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
+    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
+  "16:                                                 \n\t"
+    "sb       %[temp5],  96(%[dst])                    \n\t"
+    "sb       %[temp8],  97(%[dst])                    \n\t"
+    "sb       %[temp11], 98(%[dst])                    \n\t"
+    "sb       %[temp16], 99(%[dst])                    \n\t"
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+      [temp18]"=&r"(temp18)
+    : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
+    : "memory", "hi", "lo"
+  );
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);
+  }
+}
+
+#endif  // WEBP_USE_MIPS32
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPS32(void);
+
+void VP8DspInitMIPS32(void) {
+#if defined(WEBP_USE_MIPS32)
+  VP8InitClipTables();
+
+  VP8Transform = TransformTwo;
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+#endif  // WEBP_USE_MIPS32
+}
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -0,0 +1,978 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of some decoding functions (idct, loop filtering).
+//
+// Author: somnath@google.com (Somnath Banerjee)
+//         cduvivier@google.com (Christian Duvivier)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
+// one it seems => disable it by default. Uncomment the following to enable:
+// #define USE_TRANSFORM_AC3
+
+#include <emmintrin.h>
+#include "../dec/vp8i.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
+  // This implementation makes use of 16-bit fixed point versions of two
+  // multiply constants:
+  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+  //
+  // To be able to use signed 16-bit integers, we use the following trick to
+  // have constants within range:
+  // - Associated constants are obtained by subtracting the 16-bit fixed point
+  //   version of one:
+  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
+  //      K1 = 85267  =>  k1 =  20091
+  //      K2 = 35468  =>  k2 = -30068
+  // - The multiplication of a variable by a constant become the sum of the
+  //   variable and the multiplication of that variable by the associated
+  //   constant:
+  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+  const __m128i k1 = _mm_set1_epi16(20091);
+  const __m128i k2 = _mm_set1_epi16(-30068);
+  __m128i T0, T1, T2, T3;
+
+  // Load and concatenate the transform coefficients (we'll do two transforms
+  // in parallel). In the case of only one transform, the second half of the
+  // vectors will just contain random value we'll never use nor store.
+  __m128i in0, in1, in2, in3;
+  {
+    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
+    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
+    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
+    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+    // a00 a10 a20 a30   x x x x
+    // a01 a11 a21 a31   x x x x
+    // a02 a12 a22 a32   x x x x
+    // a03 a13 a23 a33   x x x x
+    if (do_two) {
+      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+      in0 = _mm_unpacklo_epi64(in0, inB0);
+      in1 = _mm_unpacklo_epi64(in1, inB1);
+      in2 = _mm_unpacklo_epi64(in2, inB2);
+      in3 = _mm_unpacklo_epi64(in3, inB3);
+      // a00 a10 a20 a30   b00 b10 b20 b30
+      // a01 a11 a21 a31   b01 b11 b21 b31
+      // a02 a12 a22 a32   b02 b12 b22 b32
+      // a03 a13 a23 a33   b03 b13 b23 b33
+    }
+  }
+
+  // Vertical pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i a = _mm_add_epi16(in0, in2);
+    const __m128i b = _mm_sub_epi16(in0, in2);
+    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
+    const __m128i c3 = _mm_sub_epi16(in1, in3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
+    const __m128i d3 = _mm_add_epi16(in1, in3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 a22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i four = _mm_set1_epi16(4);
+    const __m128i dc = _mm_add_epi16(T0, four);
+    const __m128i a =  _mm_add_epi16(dc, T2);
+    const __m128i b =  _mm_sub_epi16(dc, T2);
+    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
+    const __m128i c3 = _mm_sub_epi16(T1, T3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
+    const __m128i d3 = _mm_add_epi16(T1, T3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
+    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
+    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
+    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 a22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Add inverse transform to 'dst' and store.
+  {
+    const __m128i zero = _mm_setzero_si128();
+    // Load the reference(s).
+    __m128i dst0, dst1, dst2, dst3;
+    if (do_two) {
+      // Load eight bytes/pixels per line.
+      dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
+      dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
+      dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
+      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
+    } else {
+      // Load four bytes/pixels per line.
+      dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+      dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+      dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+      dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+    }
+    // Convert to 16b.
+    dst0 = _mm_unpacklo_epi8(dst0, zero);
+    dst1 = _mm_unpacklo_epi8(dst1, zero);
+    dst2 = _mm_unpacklo_epi8(dst2, zero);
+    dst3 = _mm_unpacklo_epi8(dst3, zero);
+    // Add the inverse transform(s).
+    dst0 = _mm_add_epi16(dst0, T0);
+    dst1 = _mm_add_epi16(dst1, T1);
+    dst2 = _mm_add_epi16(dst2, T2);
+    dst3 = _mm_add_epi16(dst3, T3);
+    // Unsigned saturate to 8b.
+    dst0 = _mm_packus_epi16(dst0, dst0);
+    dst1 = _mm_packus_epi16(dst1, dst1);
+    dst2 = _mm_packus_epi16(dst2, dst2);
+    dst3 = _mm_packus_epi16(dst3, dst3);
+    // Store the results.
+    if (do_two) {
+      // Store eight bytes/pixels per line.
+      _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
+      _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
+      _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
+      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
+    } else {
+      // Store four bytes/pixels per line.
+      *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+      *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+      *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+      *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+    }
+  }
+}
+
+#if defined(USE_TRANSFORM_AC3)
+#define MUL(a, b) (((a) * (b)) >> 16)
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  static const int kC1 = 20091 + (1 << 16);
+  static const int kC2 = 35468;
+  const __m128i A = _mm_set1_epi16(in[0] + 4);
+  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
+  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
+  const __m128i B = _mm_adds_epi16(A, CD);
+  const __m128i m0 = _mm_adds_epi16(B, d4);
+  const __m128i m1 = _mm_adds_epi16(B, c4);
+  const __m128i m2 = _mm_subs_epi16(B, c4);
+  const __m128i m3 = _mm_subs_epi16(B, d4);
+  const __m128i zero = _mm_setzero_si128();
+  // Load the source pixels.
+  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+  // Convert to 16b.
+  dst0 = _mm_unpacklo_epi8(dst0, zero);
+  dst1 = _mm_unpacklo_epi8(dst1, zero);
+  dst2 = _mm_unpacklo_epi8(dst2, zero);
+  dst3 = _mm_unpacklo_epi8(dst3, zero);
+  // Add the inverse transform.
+  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
+  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
+  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
+  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
+  // Unsigned saturate to 8b.
+  dst0 = _mm_packus_epi16(dst0, dst0);
+  dst1 = _mm_packus_epi16(dst1, dst1);
+  dst2 = _mm_packus_epi16(dst2, dst2);
+  dst3 = _mm_packus_epi16(dst3, dst3);
+  // Store the results.
+  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+}
+#undef MUL
+#endif   // USE_TRANSFORM_AC3
+
+//------------------------------------------------------------------------------
+// Loop Filter (Paragraph 15)
+
+// Compute abs(p - q) = subs(p - q) OR subs(q - p)
+#define MM_ABS(p, q)  _mm_or_si128(                                            \
+    _mm_subs_epu8((q), (p)),                                                   \
+    _mm_subs_epu8((p), (q)))
+
+// Shift each byte of "x" by 3 bits while preserving by the sign bit.
+static WEBP_INLINE void SignedShift8b(__m128i* const x) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i signs = _mm_cmpgt_epi8(zero, *x);
+  const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs);  // s8 -> s16 sign extend
+  const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
+  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
+  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
+  *x = _mm_packs_epi16(lo_1, hi_1);
+}
+
+#define FLIP_SIGN_BIT2(a, b) {                                                 \
+  a = _mm_xor_si128(a, sign_bit);                                              \
+  b = _mm_xor_si128(b, sign_bit);                                              \
+}
+
+#define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
+  FLIP_SIGN_BIT2(a, b);                                                        \
+  FLIP_SIGN_BIT2(c, d);                                                        \
+}
+
+// input/output is uint8_t
+static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
+                                  const __m128i* const p0,
+                                  const __m128i* const q0,
+                                  const __m128i* const q1,
+                                  int hev_thresh, __m128i* const not_hev) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i t_1 = MM_ABS(*p1, *p0);
+  const __m128i t_2 = MM_ABS(*q1, *q0);
+
+  const __m128i h = _mm_set1_epi8(hev_thresh);
+  const __m128i t_3 = _mm_subs_epu8(t_1, h);  // abs(p1 - p0) - hev_tresh
+  const __m128i t_4 = _mm_subs_epu8(t_2, h);  // abs(q1 - q0) - hev_tresh
+
+  *not_hev = _mm_or_si128(t_3, t_4);
+  *not_hev = _mm_cmpeq_epi8(*not_hev, zero);  // not_hev <= t1 && not_hev <= t2
+}
+
+// input pixels are int8_t
+static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
+                                     const __m128i* const p0,
+                                     const __m128i* const q0,
+                                     const __m128i* const q1,
+                                     __m128i* const delta) {
+  // beware of addition order, for saturation!
+  const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1);   // p1 - q1
+  const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0);   // q0 - p0
+  const __m128i s1 = _mm_adds_epi8(p1_q1, q0_p0);  // p1 - q1 + 1 * (q0 - p0)
+  const __m128i s2 = _mm_adds_epi8(q0_p0, s1);     // p1 - q1 + 2 * (q0 - p0)
+  const __m128i s3 = _mm_adds_epi8(q0_p0, s2);     // p1 - q1 + 3 * (q0 - p0)
+  *delta = s3;
+}
+
+// input and output are int8_t
+static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
+                                       const __m128i* const fl) {
+  const __m128i k3 = _mm_set1_epi8(3);
+  const __m128i k4 = _mm_set1_epi8(4);
+  __m128i v3 = _mm_adds_epi8(*fl, k3);
+  __m128i v4 = _mm_adds_epi8(*fl, k4);
+
+  SignedShift8b(&v4);                  // v4 >> 3
+  SignedShift8b(&v3);                  // v3 >> 3
+  *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
+  *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
+}
+
+// Updates values of 2 pixels at MB edge during complex filtering.
+// Update operations:
+// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
+// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
+static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
+                                      const __m128i* const a0_lo,
+                                      const __m128i* const a0_hi) {
+  const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
+  const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
+  const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  *pi = _mm_adds_epi8(*pi, delta);
+  *qi = _mm_subs_epi8(*qi, delta);
+  FLIP_SIGN_BIT2(*pi, *qi);
+}
+
+// input pixels are uint8_t
+static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
+                                    const __m128i* const p0,
+                                    const __m128i* const q0,
+                                    const __m128i* const q1,
+                                    int thresh, __m128i* const mask) {
+  const __m128i m_thresh = _mm_set1_epi8(thresh);
+  const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
+  const __m128i kFE = _mm_set1_epi8(0xFE);
+  const __m128i t2 = _mm_and_si128(t1, kFE);  // set lsb of each byte to zero
+  const __m128i t3 = _mm_srli_epi16(t2, 1);   // abs(p1 - q1) / 2
+
+  const __m128i t4 = MM_ABS(*p0, *q0);        // abs(p0 - q0)
+  const __m128i t5 = _mm_adds_epu8(t4, t4);   // abs(p0 - q0) * 2
+  const __m128i t6 = _mm_adds_epu8(t5, t3);   // abs(p0-q0)*2 + abs(p1-q1)/2
+
+  const __m128i t7 = _mm_subs_epu8(t6, m_thresh);  // mask <= m_thresh
+  *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
+}
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+// Applies filter on 2 pixels (p0 and q0)
+static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
+                                  __m128i* const q0, __m128i* const q1,
+                                  int thresh) {
+  __m128i a, mask;
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  // convert p1/q1 to int8_t (for GetBaseDelta)
+  const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
+  const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
+
+  NeedsFilter(p1, p0, q0, q1, thresh, &mask);
+
+  FLIP_SIGN_BIT2(*p0, *q0);
+  GetBaseDelta(&p1s, p0, q0, &q1s, &a);
+  a = _mm_and_si128(a, mask);     // mask filter values we don't care about
+  DoSimpleFilter(p0, q0, &a);
+  FLIP_SIGN_BIT2(*p0, *q0);
+}
+
+// Applies filter on 4 pixels (p1, p0, q0 and q1)
+static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
+                                  __m128i* const q0, __m128i* const q1,
+                                  const __m128i* const mask, int hev_thresh) {
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  const __m128i k64 = _mm_set1_epi8(0x40);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i not_hev;
+  __m128i t1, t2, t3;
+
+  // compute hev mask
+  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+
+  // convert to signed values
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+
+  t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
+  t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
+  t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
+  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
+  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
+  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
+  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about
+
+  t2 = _mm_set1_epi8(3);
+  t3 = _mm_set1_epi8(4);
+  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 3
+  t3 = _mm_adds_epi8(t1, t3);        // 3 * (q0 - p0) + (p1 - q1) + 4
+  SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+  SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
+  *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
+  FLIP_SIGN_BIT2(*p0, *q0);
+
+  // this is equivalent to signed (a + 1) >> 1 calculation
+  t2 = _mm_add_epi8(t3, sign_bit);
+  t3 = _mm_avg_epu8(t2, zero);
+  t3 = _mm_sub_epi8(t3, k64);
+
+  t3 = _mm_and_si128(not_hev, t3);   // if !hev
+  *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
+  *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
+  FLIP_SIGN_BIT2(*p1, *q1);
+}
+
+// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
+static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
+                                  __m128i* const p0, __m128i* const q0,
+                                  __m128i* const q1, __m128i* const q2,
+                                  const __m128i* const mask, int hev_thresh) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  __m128i a, not_hev;
+
+  // compute hev mask
+  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+  FLIP_SIGN_BIT2(*p2, *q2);
+  GetBaseDelta(p1, p0, q0, q1, &a);
+
+  { // do simple filter on pixels with hev
+    const __m128i m = _mm_andnot_si128(not_hev, *mask);
+    const __m128i f = _mm_and_si128(a, m);
+    DoSimpleFilter(p0, q0, &f);
+  }
+
+  { // do strong filter on pixels with not hev
+    const __m128i k9 = _mm_set1_epi16(0x0900);
+    const __m128i k63 = _mm_set1_epi16(63);
+
+    const __m128i m = _mm_and_si128(not_hev, *mask);
+    const __m128i f = _mm_and_si128(a, m);
+
+    const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
+    const __m128i f_hi = _mm_unpackhi_epi8(zero, f);
+
+    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9);    // Filter (lo) * 9
+    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9);    // Filter (hi) * 9
+
+    const __m128i a2_lo = _mm_add_epi16(f9_lo, k63);    // Filter * 9 + 63
+    const __m128i a2_hi = _mm_add_epi16(f9_hi, k63);    // Filter * 9 + 63
+
+    const __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo);  // Filter * 18 + 63
+    const __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi);  // Filter * 18 + 63
+
+    const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo);  // Filter * 27 + 63
+    const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi);  // Filter * 27 + 63
+
+    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
+    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
+    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
+  }
+}
+
+// reads 8 rows across a vertical edge.
+//
+// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
+// two Load4x4() to avoid code duplication.
+static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
+                                __m128i* const p, __m128i* const q) {
+  __m128i t1, t2;
+
+  // Load 0th, 1st, 4th and 5th rows
+  __m128i r0 =  _mm_cvtsi32_si128(*((int*)&b[0 * stride]));  // 03 02 01 00
+  __m128i r1 =  _mm_cvtsi32_si128(*((int*)&b[1 * stride]));  // 13 12 11 10
+  __m128i r4 =  _mm_cvtsi32_si128(*((int*)&b[4 * stride]));  // 43 42 41 40
+  __m128i r5 =  _mm_cvtsi32_si128(*((int*)&b[5 * stride]));  // 53 52 51 50
+
+  r0 = _mm_unpacklo_epi32(r0, r4);               // 43 42 41 40 03 02 01 00
+  r1 = _mm_unpacklo_epi32(r1, r5);               // 53 52 51 50 13 12 11 10
+
+  // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+  t1 = _mm_unpacklo_epi8(r0, r1);
+
+  // Load 2nd, 3rd, 6th and 7th rows
+  r0 =  _mm_cvtsi32_si128(*((int*)&b[2 * stride]));          // 23 22 21 22
+  r1 =  _mm_cvtsi32_si128(*((int*)&b[3 * stride]));          // 33 32 31 30
+  r4 =  _mm_cvtsi32_si128(*((int*)&b[6 * stride]));          // 63 62 61 60
+  r5 =  _mm_cvtsi32_si128(*((int*)&b[7 * stride]));          // 73 72 71 70
+
+  r0 = _mm_unpacklo_epi32(r0, r4);               // 63 62 61 60 23 22 21 20
+  r1 = _mm_unpacklo_epi32(r1, r5);               // 73 72 71 70 33 32 31 30
+
+  // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+  t2 = _mm_unpacklo_epi8(r0, r1);
+
+  // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+  // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+  r0 = t1;
+  t1 = _mm_unpacklo_epi16(t1, t2);
+  t2 = _mm_unpackhi_epi16(r0, t2);
+
+  // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+  // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+  *p = _mm_unpacklo_epi32(t1, t2);
+  *q = _mm_unpackhi_epi32(t1, t2);
+}
+
+static WEBP_INLINE void Load16x4(const uint8_t* const r0,
+                                 const uint8_t* const r8,
+                                 int stride,
+                                 __m128i* const p1, __m128i* const p0,
+                                 __m128i* const q0, __m128i* const q1) {
+  __m128i t1, t2;
+  // Assume the pixels around the edge (|) are numbered as follows
+  //                00 01 | 02 03
+  //                10 11 | 12 13
+  //                 ...  |  ...
+  //                e0 e1 | e2 e3
+  //                f0 f1 | f2 f3
+  //
+  // r0 is pointing to the 0th row (00)
+  // r8 is pointing to the 8th row (80)
+
+  // Load
+  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+  Load8x4(r0, stride, p1, q0);
+  Load8x4(r8, stride, p0, q1);
+
+  t1 = *p1;
+  t2 = *q0;
+  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+  *p1 = _mm_unpacklo_epi64(t1, *p0);
+  *p0 = _mm_unpackhi_epi64(t1, *p0);
+  *q0 = _mm_unpacklo_epi64(t2, *q1);
+  *q1 = _mm_unpackhi_epi64(t2, *q1);
+}
+
+static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
+  int i;
+  for (i = 0; i < 4; ++i, dst += stride) {
+    *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
+    *x = _mm_srli_si128(*x, 4);
+  }
+}
+
+// Transpose back and store
+static WEBP_INLINE void Store16x4(const __m128i* const p1,
+                                  const __m128i* const p0,
+                                  const __m128i* const q0,
+                                  const __m128i* const q1,
+                                  uint8_t* r0, uint8_t* r8,
+                                  int stride) {
+  __m128i t1, p1_s, p0_s, q0_s, q1_s;
+
+  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+  t1 = *p0;
+  p0_s = _mm_unpacklo_epi8(*p1, t1);
+  p1_s = _mm_unpackhi_epi8(*p1, t1);
+
+  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+  t1 = *q0;
+  q0_s = _mm_unpacklo_epi8(t1, *q1);
+  q1_s = _mm_unpackhi_epi8(t1, *q1);
+
+  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+  t1 = p0_s;
+  p0_s = _mm_unpacklo_epi16(t1, q0_s);
+  q0_s = _mm_unpackhi_epi16(t1, q0_s);
+
+  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+  t1 = p1_s;
+  p1_s = _mm_unpacklo_epi16(t1, q1_s);
+  q1_s = _mm_unpackhi_epi16(t1, q1_s);
+
+  Store4x4(&p0_s, r0, stride);
+  r0 += 4 * stride;
+  Store4x4(&q0_s, r0, stride);
+
+  Store4x4(&p1_s, r8, stride);
+  r8 += 4 * stride;
+  Store4x4(&q1_s, r8, stride);
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+  // Load
+  __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
+  __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
+  __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
+  __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);
+
+  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+
+  // Store
+  _mm_storeu_si128((__m128i*)&p[-stride], p0);
+  _mm_storeu_si128((__m128i*)&p[0], q0);
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+  __m128i p1, p0, q0, q1;
+
+  p -= 2;  // beginning of p1
+
+  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+  Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16(p, stride, thresh);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+#define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
+  m = MM_ABS(p1, p0);                                                          \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+} while (0)
+
+#define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
+  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+} while (0)
+
+#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
+  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
+  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]);                            \
+  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]);                            \
+  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
+}
+
+#define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
+  const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
+  const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);                 \
+  p = _mm_unpacklo_epi64(U, V);                                                \
+} while (0)
+
+#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
+  LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
+  LOADUV_H_EDGE(e2, u, v, 1 * stride);                                         \
+  LOADUV_H_EDGE(e3, u, v, 2 * stride);                                         \
+  LOADUV_H_EDGE(e4, u, v, 3 * stride);                                         \
+}
+
+#define STOREUV(p, u, v, stride) {                                             \
+  _mm_storel_epi64((__m128i*)&u[(stride)], p);                                 \
+  p = _mm_srli_si128(p, 8);                                                    \
+  _mm_storel_epi64((__m128i*)&v[(stride)], p);                                 \
+}
+
+static WEBP_INLINE void ComplexMask(const __m128i* const p1,
+                                    const __m128i* const p0,
+                                    const __m128i* const q0,
+                                    const __m128i* const q1,
+                                    int thresh, int ithresh,
+                                    __m128i* const mask) {
+  const __m128i it = _mm_set1_epi8(ithresh);
+  const __m128i diff = _mm_subs_epu8(*mask, it);
+  const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
+  __m128i filter_mask;
+  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
+  *mask = _mm_and_si128(thresh_mask, filter_mask);
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  __m128i t1;
+  __m128i mask;
+  __m128i p2, p1, p0, q0, q1, q2;
+
+  // Load p3, p2, p1, p0
+  LOAD_H_EDGES4(p - 4 * stride, stride, t1, p2, p1, p0);
+  MAX_DIFF1(t1, p2, p1, p0, mask);
+
+  // Load q0, q1, q2, q3
+  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
+  MAX_DIFF2(t1, q2, q1, q0, mask);
+
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  // Store
+  _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
+  _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
+  _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
+  _mm_storeu_si128((__m128i*)&p[+0 * stride], q0);
+  _mm_storeu_si128((__m128i*)&p[+1 * stride], q1);
+  _mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  uint8_t* const b = p - 4;
+  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  MAX_DIFF1(p3, p2, p1, p0, mask);
+
+  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
+  MAX_DIFF2(q3, q2, q1, q0, mask);
+
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
+  Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  __m128i p3, p2, p1, p0;   // loop invariants
+
+  LOAD_H_EDGES4(p, stride, p3, p2, p1, p0);  // prologue
+
+  for (k = 3; k > 0; --k) {
+    __m128i mask, tmp1, tmp2;
+    uint8_t* const b = p + 2 * stride;   // beginning of p1
+    p += 4 * stride;
+
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
+    LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2);
+    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
+
+    // p3 and p2 are not just temporary variables here: they will be
+    // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+
+    // Store
+    _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
+    _mm_storeu_si128((__m128i*)&b[1 * stride], p0);
+    _mm_storeu_si128((__m128i*)&b[2 * stride], p3);
+    _mm_storeu_si128((__m128i*)&b[3 * stride], p2);
+
+    // rotate samples
+    p1 = tmp1;
+    p0 = tmp2;
+  }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  __m128i p3, p2, p1, p0;   // loop invariants
+
+  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
+
+  for (k = 3; k > 0; --k) {
+    __m128i mask, tmp1, tmp2;
+    uint8_t* const b = p + 2;   // beginning of p1
+
+    p += 4;  // beginning of q0 (and next span)
+
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
+    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
+
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+
+    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
+
+    // rotate samples
+    p1 = tmp1;
+    p0 = tmp2;
+  }
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, p2, p1, p0, q0, q1, q2;
+
+  // Load p3, p2, p1, p0
+  LOADUV_H_EDGES4(u - 4 * stride, v - 4 * stride, stride, t1, p2, p1, p0);
+  MAX_DIFF1(t1, p2, p1, p0, mask);
+
+  // Load q0, q1, q2, q3
+  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
+  MAX_DIFF2(t1, q2, q1, q0, mask);
+
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  // Store
+  STOREUV(p2, u, v, -3 * stride);
+  STOREUV(p1, u, v, -2 * stride);
+  STOREUV(p0, u, v, -1 * stride);
+  STOREUV(q0, u, v, 0 * stride);
+  STOREUV(q1, u, v, 1 * stride);
+  STOREUV(q2, u, v, 2 * stride);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  uint8_t* const tu = u - 4;
+  uint8_t* const tv = v - 4;
+  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  MAX_DIFF1(p3, p2, p1, p0, mask);
+
+  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
+  MAX_DIFF2(q3, q2, q1, q0, mask);
+
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
+  Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  // Load p3, p2, p1, p0
+  LOADUV_H_EDGES4(u, v, stride, t2, t1, p1, p0);
+  MAX_DIFF1(t2, t1, p1, p0, mask);
+
+  u += 4 * stride;
+  v += 4 * stride;
+
+  // Load q0, q1, q2, q3
+  LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
+  MAX_DIFF2(t2, t1, q1, q0, mask);
+
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+  // Store
+  STOREUV(p1, u, v, -2 * stride);
+  STOREUV(p0, u, v, -1 * stride);
+  STOREUV(q0, u, v, 0 * stride);
+  STOREUV(q1, u, v, 1 * stride);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
+  MAX_DIFF1(t2, t1, p1, p0, mask);
+
+  u += 4;  // beginning of q0
+  v += 4;
+  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+  MAX_DIFF2(t2, t1, q1, q0, mask);
+
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+  u -= 2;  // beginning of p1
+  v -= 2;
+  Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
+}
+
+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitSSE2(void);
+
+void VP8DspInitSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  VP8Transform = Transform;
+#if defined(USE_TRANSFORM_AC3)
+  VP8TransformAC3 = TransformAC3;
+#endif
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+#endif   // WEBP_USE_SSE2
+}
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -0,0 +1,293 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   Speed-critical functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_DSP_H_
+#define WEBP_DSP_DSP_H_
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// CPU detection
+
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#ifdef __clang__
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif  // __clang__
+
+#if defined(_MSC_VER) && _MSC_VER > 1310 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+#endif
+
+// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
+// files without intrinsics, allowing the corresponding Init() to be called.
+// Files containing intrinsics will need to be built targeting the instruction
+// set so should succeed on one of the earlier tests.
+#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
+#define WEBP_USE_SSE2
+#endif
+
+#if defined(__AVX2__) || defined(WEBP_HAVE_AVX2)
+#define WEBP_USE_AVX2
+#endif
+
+#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
+#define WEBP_ANDROID_NEON  // Android targets that might support NEON
+#endif
+
+#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || defined(__aarch64__)
+#define WEBP_USE_NEON
+#endif
+
+#if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6)
+#define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#endif
+#endif
+
+typedef enum {
+  kSSE2,
+  kSSE3,
+  kAVX,
+  kAVX2,
+  kNEON,
+  kMIPS32
+} CPUFeature;
+// returns true if the CPU supports the feature.
+typedef int (*VP8CPUInfo)(CPUFeature feature);
+extern VP8CPUInfo VP8GetCPUInfo;
+
+//------------------------------------------------------------------------------
+// Encoding
+
+// Transforms
+// VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
+//          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
+typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                        int do_two);
+typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
+typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
+extern VP8Idct VP8ITransform;
+extern VP8Fdct VP8FTransform;
+extern VP8WHT VP8FTransformWHT;
+// Predictions
+// *dst is the destination block. *top and *left can be NULL.
+typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
+                              const uint8_t* top);
+typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
+extern VP8Intra4Preds VP8EncPredLuma4;
+extern VP8IntraPreds VP8EncPredLuma16;
+extern VP8IntraPreds VP8EncPredChroma8;
+
+typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
+extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
+typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
+                          const uint16_t* const weights);
+extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
+
+typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
+extern VP8BlockCopy VP8Copy4x4;
+// Quantization
+struct VP8Matrix;   // forward declaration
+typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
+                                const struct VP8Matrix* const mtx);
+extern VP8QuantizeBlock VP8EncQuantizeBlock;
+
+// specific to 2nd transform:
+typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
+                                   const struct VP8Matrix* const mtx);
+extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
+
+// Collect histogram for susceptibility calculation and accumulate in histo[].
+struct VP8Histogram;
+typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+                          int start_block, int end_block,
+                          struct VP8Histogram* const histo);
+extern const int VP8DspScan[16 + 4 + 4];
+extern VP8CHisto VP8CollectHistogram;
+
+void VP8EncDspInit(void);   // must be called before using any of the above
+
+//------------------------------------------------------------------------------
+// Decoding
+
+typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
+// when doing two transforms, coeffs is actually int16_t[2][16].
+typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
+extern VP8DecIdct2 VP8Transform;
+extern VP8DecIdct VP8TransformAC3;
+extern VP8DecIdct VP8TransformUV;
+extern VP8DecIdct VP8TransformDC;
+extern VP8DecIdct VP8TransformDCUV;
+extern VP8WHT VP8TransformWHT;
+
+// *dst is the destination block, with stride BPS. Boundary samples are
+// assumed accessible when needed.
+typedef void (*VP8PredFunc)(uint8_t* dst);
+extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+
+// clipping tables (for filtering)
+extern const int8_t* const VP8ksclip1;  // clips [-1020, 1020] to [-128, 127]
+extern const int8_t* const VP8ksclip2;  // clips [-112, 112] to [-16, 15]
+extern const uint8_t* const VP8kclip1;  // clips [-255,511] to [0,255]
+extern const uint8_t* const VP8kabs0;   // abs(x) for x in [-255,255]
+void VP8InitClipTables(void);           // must be called first
+
+// simple filter (only for luma)
+typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
+extern VP8SimpleFilterFunc VP8SimpleVFilter16;
+extern VP8SimpleFilterFunc VP8SimpleHFilter16;
+extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
+extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
+
+// regular filter (on both macroblock edges and inner edges)
+typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
+                                  int thresh, int ithresh, int hev_t);
+typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
+                                    int thresh, int ithresh, int hev_t);
+// on outer edge
+extern VP8LumaFilterFunc VP8VFilter16;
+extern VP8LumaFilterFunc VP8HFilter16;
+extern VP8ChromaFilterFunc VP8VFilter8;
+extern VP8ChromaFilterFunc VP8HFilter8;
+
+// on inner edge
+extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
+extern VP8LumaFilterFunc VP8HFilter16i;
+extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
+extern VP8ChromaFilterFunc VP8HFilter8i;
+
+// must be called before anything using the above
+void VP8DspInit(void);
+
+//------------------------------------------------------------------------------
+// WebP I/O
+
+#define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
+
+// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
+// bottom_y can be NULL if only one line of output is needed (at top/bottom).
+typedef void (*WebPUpsampleLinePairFunc)(
+    const uint8_t* top_y, const uint8_t* bottom_y,
+    const uint8_t* top_u, const uint8_t* top_v,
+    const uint8_t* cur_u, const uint8_t* cur_v,
+    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+#ifdef FANCY_UPSAMPLING
+
+// Fancy upsampling functions to convert YUV to RGB(A) modes
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+#endif    // FANCY_UPSAMPLING
+
+// Per-row point-sampling methods.
+typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
+                                   const uint8_t* u, const uint8_t* v,
+                                   uint8_t* dst, int len);
+// Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
+void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
+                             const uint8_t* u, const uint8_t* v, int uv_stride,
+                             uint8_t* dst, int dst_stride,
+                             int width, int height, WebPSamplerRowFunc func);
+
+// Sampling functions to convert rows of YUV to RGB(A)
+extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
+
+// General function for converting two lines of ARGB or RGBA.
+// 'alpha_is_last' should be true if 0xff000000 is stored in memory as
+// as 0x00, 0x00, 0x00, 0xff (little endian).
+WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
+
+// YUV444->RGB converters
+typedef void (*WebPYUV444Converter)(const uint8_t* y,
+                                    const uint8_t* u, const uint8_t* v,
+                                    uint8_t* dst, int len);
+
+extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+
+// Must be called before using the WebPUpsamplers[] (and for premultiplied
+// colorspaces like rgbA, rgbA4444, etc)
+void WebPInitUpsamplers(void);
+// Must be called before using WebPSamplers[]
+void WebPInitSamplers(void);
+
+//------------------------------------------------------------------------------
+// Utilities for processing transparent channel.
+
+// Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
+// alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
+extern void (*WebPApplyAlphaMultiply)(
+    uint8_t* rgba, int alpha_first, int w, int h, int stride);
+
+// Same, buf specifically for RGBA4444 format
+extern void (*WebPApplyAlphaMultiply4444)(
+    uint8_t* rgba4444, int w, int h, int stride);
+
+// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
+// (this is the opposite of WebPDispatchAlpha).
+// Returns true if there's only trivial 0xff alpha values.
+extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
+                               int width, int height,
+                               uint8_t* alpha, int alpha_stride);
+
+// Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
+// Un-Multiply operation transforms x into x * 255 / A.
+
+// Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
+extern void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
+
+// Same a WebPMultARGBRow(), but for several rows.
+void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
+                      int inverse);
+
+// Same for a row of single values, with side alpha values.
+extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+                           int width, int inverse);
+
+// Same a WebPMultRow(), but for several 'num_rows' rows.
+void WebPMultRows(uint8_t* ptr, int stride,
+                  const uint8_t* alpha, int alpha_stride,
+                  int width, int num_rows, int inverse);
+
+// To be called first before using the above.
+void WebPInitAlphaProcessing(void);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DSP_DSP_H_ */
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -1,22 +1,65 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// speed-critical functions.
+// Speed-critical encoding functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)

 #include <assert.h>
-#include "vp8enci.h"
+#include <stdlib.h>  // for abs()

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "./dsp.h"
+#include "../enc/vp8enci.h"

-//-----------------------------------------------------------------------------
+static WEBP_INLINE uint8_t clip_8b(int v) {
+  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int clip_max(int v, int max) {
+  return (v > max) ? max : v;
+}
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+const int VP8DspScan[16 + 4 + 4] = {
+  // Luma
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+
+  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
+  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
+};
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
+  int j;
+  for (j = start_block; j < end_block; ++j) {
+    int k;
+    int16_t out[16];
+
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to bin.
+    for (k = 0; k < 16; ++k) {
+      const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
+      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
+      histo->distribution[clipped_value]++;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
 // run-time tables (~4k)

 static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
@ -29,17 +72,14 @@ static void InitTables(void) {
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255 + 255; ++i) {
-      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
+      clip1[255 + i] = clip_8b(i);
    }
    tables_ok = 1;
  }
 }

-static inline uint8_t clip_8b(int v) {
-  return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
-}

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

 #define STORE(x, y, v) \
@ -49,7 +89,8 @@ static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)

-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
@ -81,113 +122,90 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) {
  }
 }

+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);
+  }
+}
+
 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
-    const int d0 = src[0] - ref[0];
+    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
-    const int a0 = (d0 + d3) << 3;
-    const int a1 = (d1 + d2) << 3;
-    const int a2 = (d1 - d2) << 3;
-    const int a3 = (d0 - d3) << 3;
-    tmp[0 + i * 4] = (a0 + a1);
-    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12;
-    tmp[2 + i * 4] = (a0 - a1);
-    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  7500) >> 12;
+    const int a0 = (d0 + d3);         // 10b                      [-510,510]
+    const int a1 = (d1 + d2);
+    const int a2 = (d1 - d2);
+    const int a3 = (d0 - d3);
+    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
+    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
+    tmp[2 + i * 4] = (a0 - a1) * 8;
+    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[12 + i]);
+    const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
    const int a3 = (tmp[0 + i] - tmp[12 + i]);
-    out[0 + i] = (a0 + a1 + 7) >> 4;
+    out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
 }

-static void ITransformWHT(const int16_t* in, int16_t* out) {
-  int tmp[16];
-  int i;
-  for (i = 0; i < 4; ++i) {
-    const int a0 = in[0 + i] + in[12 + i];
-    const int a1 = in[4 + i] + in[ 8 + i];
-    const int a2 = in[4 + i] - in[ 8 + i];
-    const int a3 = in[0 + i] - in[12 + i];
-    tmp[0  + i] = a0 + a1;
-    tmp[8  + i] = a0 - a1;
-    tmp[4  + i] = a3 + a2;
-    tmp[12 + i] = a3 - a2;
-  }
-  for (i = 0; i < 4; ++i) {
-    const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
-    const int a0 = dc             + tmp[3 + i * 4];
-    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
-    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
-    const int a3 = dc             - tmp[3 + i * 4];
-    out[ 0] = (a0 + a1) >> 3;
-    out[16] = (a3 + a2) >> 3;
-    out[32] = (a0 - a1) >> 3;
-    out[48] = (a3 - a2) >> 3;
-    out += 64;
-  }
-}
-
 static void FTransformWHT(const int16_t* in, int16_t* out) {
-  int tmp[16];
+  // input is 12b signed
+  int32_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
-    const int a0 = (in[0 * 16] + in[2 * 16]) << 2;
-    const int a1 = (in[1 * 16] + in[3 * 16]) << 2;
-    const int a2 = (in[1 * 16] - in[3 * 16]) << 2;
-    const int a3 = (in[0 * 16] - in[2 * 16]) << 2;
-    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
+    const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
+    const int a1 = (in[1 * 16] + in[3 * 16]);
+    const int a2 = (in[1 * 16] - in[3 * 16]);
+    const int a3 = (in[0 * 16] - in[2 * 16]);
+    tmp[0 + i * 4] = a0 + a1;   // 14b
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
-    const int b0 = a0 + a1;
+    const int b0 = a0 + a1;    // 16b
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
-    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
-    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
-    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
-    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
+    out[ 0 + i] = b0 >> 1;     // 15b
+    out[ 4 + i] = b1 >> 1;
+    out[ 8 + i] = b2 >> 1;
+    out[12 + i] = b3 >> 1;
  }
 }

-// default C implementations:
-VP8Idct VP8ITransform = ITransform;
-VP8Fdct VP8FTransform = FTransform;
-VP8WHT VP8ITransformWHT = ITransformWHT;
-VP8WHT VP8FTransformWHT = FTransformWHT;
-
 #undef MUL
 #undef STORE

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Intra predictions

-#define OUT(x, y) dst[(x) + (y) * BPS]
+#define DST(x, y) dst[(x) + (y) * BPS]

-static inline void Fill(uint8_t* dst, int value, int size) {
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  int j;
  for (j = 0; j < size; ++j) {
    memset(dst + j * BPS, value, size);
  }
 }

-static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+                                     const uint8_t* top, int size) {
  int j;
  if (top) {
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
@ -196,7 +214,8 @@ static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) {
  }
 }

-static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+                                       const uint8_t* left, int size) {
  if (left) {
    int j;
    for (j = 0; j < size; ++j) {
@ -207,8 +226,8 @@ static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) {
  }
 }

-static inline void TrueMotion(uint8_t* dst, const uint8_t* left,
-                              const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top, int size) {
  int y;
  if (left) {
    if (top) {
@ -237,9 +256,9 @@ static inline void TrueMotion(uint8_t* dst, const uint8_t* left,
  }
 }

-static inline void DCMode(uint8_t* dst, const uint8_t* left,
-                          const uint8_t* top,
-                          int size, int round, int shift) {
+static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top,
+                               int size, int round, int shift) {
  int DC = 0;
  int j;
  if (top) {
@ -260,7 +279,7 @@ static inline void DCMode(uint8_t* dst, const uint8_t* left,
  Fill(dst, DC, size);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
@ -280,7 +299,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
  TrueMotion(C8TM8 + dst, left, top, 8);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

 static void Intra16Preds(uint8_t* dst,
@ -291,7 +310,7 @@ static void Intra16Preds(uint8_t* dst,
  TrueMotion(I16TM16 + dst, left, top, 16);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // luma 4x4 prediction

 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
@ -339,13 +358,13 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
-  OUT(0, 3)                                     = AVG3(J, K, L);
-  OUT(0, 2) = OUT(1, 3)                         = AVG3(I, J, K);
-  OUT(0, 1) = OUT(1, 2) = OUT(2, 3)             = AVG3(X, I, J);
-  OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I);
-  OUT(1, 0) = OUT(2, 1) = OUT(3, 2)             = AVG3(B, A, X);
-  OUT(2, 0) = OUT(3, 1)                         = AVG3(C, B, A);
-  OUT(3, 0)                                     = AVG3(D, C, B);
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
+  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
+  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
+  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
+  DST(3, 0)                                     = AVG3(D, C, B);
 }

 static void LD4(uint8_t* dst, const uint8_t* top) {
@ -357,13 +376,13 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
-  OUT(0, 0)                                     = AVG3(A, B, C);
-  OUT(1, 0) = OUT(0, 1)                         = AVG3(B, C, D);
-  OUT(2, 0) = OUT(1, 1) = OUT(0, 2)             = AVG3(C, D, E);
-  OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F);
-  OUT(3, 1) = OUT(2, 2) = OUT(1, 3)             = AVG3(E, F, G);
-  OUT(3, 2) = OUT(2, 3)                         = AVG3(F, G, H);
-  OUT(3, 3)                                     = AVG3(G, H, H);
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
+  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
+  DST(3, 3)                                     = AVG3(G, H, H);
 }

 static void VR4(uint8_t* dst, const uint8_t* top) {
@ -375,17 +394,17 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
-  OUT(0, 0) = OUT(1, 2) = AVG2(X, A);
-  OUT(1, 0) = OUT(2, 2) = AVG2(A, B);
-  OUT(2, 0) = OUT(3, 2) = AVG2(B, C);
-  OUT(3, 0)             = AVG2(C, D);
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);

-  OUT(0, 3) =             AVG3(K, J, I);
-  OUT(0, 2) =             AVG3(J, I, X);
-  OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B);
-  OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C);
-  OUT(3, 1) =             AVG3(B, C, D);
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
 }

 static void VL4(uint8_t* dst, const uint8_t* top) {
@ -397,17 +416,17 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
-  OUT(0, 0) =             AVG2(A, B);
-  OUT(1, 0) = OUT(0, 2) = AVG2(B, C);
-  OUT(2, 0) = OUT(1, 2) = AVG2(C, D);
-  OUT(3, 0) = OUT(2, 2) = AVG2(D, E);
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);

-  OUT(0, 1) =             AVG3(A, B, C);
-  OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D);
-  OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E);
-  OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F);
-              OUT(3, 2) = AVG3(E, F, G);
-              OUT(3, 3) = AVG3(F, G, H);
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
 }

 static void HU4(uint8_t* dst, const uint8_t* top) {
@ -415,14 +434,14 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
-  OUT(0, 0) =             AVG2(I, J);
-  OUT(2, 0) = OUT(0, 1) = AVG2(J, K);
-  OUT(2, 1) = OUT(0, 2) = AVG2(K, L);
-  OUT(1, 0) =             AVG3(I, J, K);
-  OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L);
-  OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L);
-  OUT(3, 2) = OUT(2, 2) =
-  OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L;
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

 static void HD4(uint8_t* dst, const uint8_t* top) {
@ -435,17 +454,17 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
  const int B = top[1];
  const int C = top[2];

-  OUT(0, 0) = OUT(2, 1) = AVG2(I, X);
-  OUT(0, 1) = OUT(2, 2) = AVG2(J, I);
-  OUT(0, 2) = OUT(2, 3) = AVG2(K, J);
-  OUT(0, 3)             = AVG2(L, K);
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);

-  OUT(3, 0)             = AVG3(A, B, C);
-  OUT(2, 0)             = AVG3(X, A, B);
-  OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X);
-  OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I);
-  OUT(1, 3)             = AVG3(L, K, J);
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
 }

 static void TM4(uint8_t* dst, const uint8_t* top) {
@ -460,6 +479,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
  }
 }

+#undef DST
 #undef AVG3
 #undef AVG2

@ -478,15 +498,11 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
  HU4(I4HU4 + dst, top);
 }

-// default C implementations
-VP8Intra4Preds VP8EncPredLuma4 = Intra4Preds;
-VP8IntraPreds VP8EncPredLuma16 = Intra16Preds;
-VP8IntraPreds VP8EncPredChroma8 = IntraChromaPreds;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Metric

-static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) {
+static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+                              int w, int h) {
  int count = 0;
  int y, x;
  for (y = 0; y < h; ++y) {
@ -513,59 +529,53 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 4, 4);
 }

-// default C implementations
-VP8Metric VP8SSE16x16 = SSE16x16;
-VP8Metric VP8SSE8x8 = SSE8x8;
-VP8Metric VP8SSE16x8 = SSE16x8;
-VP8Metric VP8SSE4x4 = SSE4x4;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Texture distortion
 //
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.

 // Hadamard transform
-static void TTransform(const uint8_t* in, int16_t* out) {
+// Returns the weighted sum of the absolute value of transformed coefficients.
+static int TTransform(const uint8_t* in, const uint16_t* w) {
+  int sum = 0;
  int tmp[16];
  int i;
+  // horizontal pass
  for (i = 0; i < 4; ++i, in += BPS) {
-    const int a0 = (in[0] + in[2]) << 2;
-    const int a1 = (in[1] + in[3]) << 2;
-    const int a2 = (in[1] - in[3]) << 2;
-    const int a3 = (in[0] - in[2]) << 2;
-    tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
+    const int a0 = in[0] + in[2];
+    const int a1 = in[1] + in[3];
+    const int a2 = in[1] - in[3];
+    const int a3 = in[0] - in[2];
+    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
-  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);
-    const int a1 = (tmp[4 + i] + tmp[12+ i]);
-    const int a2 = (tmp[4 + i] - tmp[12+ i]);
-    const int a3 = (tmp[0 + i] - tmp[8 + i]);
+  // vertical pass
+  for (i = 0; i < 4; ++i, ++w) {
+    const int a0 = tmp[0 + i] + tmp[8 + i];
+    const int a1 = tmp[4 + i] + tmp[12+ i];
+    const int a2 = tmp[4 + i] - tmp[12+ i];
+    const int a3 = tmp[0 + i] - tmp[8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
-    out[ 0 + i] = (b0 + (b0 < 0) + 3) >> 3;
-    out[ 4 + i] = (b1 + (b1 < 0) + 3) >> 3;
-    out[ 8 + i] = (b2 + (b2 < 0) + 3) >> 3;
-    out[12 + i] = (b3 + (b3 < 0) + 3) >> 3;
+
+    sum += w[ 0] * abs(b0);
+    sum += w[ 4] * abs(b1);
+    sum += w[ 8] * abs(b2);
+    sum += w[12] * abs(b3);
  }
+  return sum;
 }

 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
-  int16_t tmp1[16], tmp2[16];
-  int k;
-  int D;
-  TTransform(a, tmp1);
-  TTransform(b, tmp2);
-  D = 0;
-  for (k = 0; k < 16; ++k)
-    D += w[k] * (abs(tmp2[k]) - abs(tmp1[k]));
-  return (abs(D) + 8) >> 4;
+  const int sum1 = TTransform(a, w);
+  const int sum2 = TTransform(b, w);
+  return abs(sum2 - sum1) >> 5;
 }

 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
@ -580,30 +590,33 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
  return D;
 }

-VP8WMetric VP8TDisto4x4 = Disto4x4;
-VP8WMetric VP8TDisto16x16 = Disto16x16;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Quantization
 //

+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
 // Simple quantization
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         int n, const VP8Matrix* const mtx) {
+                         const VP8Matrix* const mtx) {
  int last = -1;
-  for (; n < 16; ++n) {
-    const int j = VP8Zigzag[n];
+  int n;
+  for (n = 0; n < 16; ++n) {
+    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
-    int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
-    if (coeff > 2047) coeff = 2047;
+    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (coeff > mtx->zthresh_[j]) {
-      const int Q = mtx->q_[j];
-      const int iQ = mtx->iq_[j];
-      const int B = mtx->bias_[j];
-      out[n] = QUANTDIV(coeff, iQ, B);
-      if (sign) out[n] = -out[n];
-      in[j] = out[n] * Q;
-      if (out[n]) last = n;
+      const uint32_t Q = mtx->q_[j];
+      const uint32_t iQ = mtx->iq_[j];
+      const uint32_t B = mtx->bias_[j];
+      int level = QUANTDIV(coeff, iQ, B);
+      if (level > MAX_LEVEL) level = MAX_LEVEL;
+      if (sign) level = -level;
+      in[j] = level * Q;
+      out[n] = level;
+      if (level) last = n;
    } else {
      out[n] = 0;
      in[j] = 0;
@ -612,13 +625,36 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return (last >= 0);
 }

-// default C implementation
-VP8QuantizeBlock VP8EncQuantizeBlock = QuantizeBlock;
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+                            const VP8Matrix* const mtx) {
+  int n, last = -1;
+  for (n = 0; n < 16; ++n) {
+    const int j = kZigzag[n];
+    const int sign = (in[j] < 0);
+    const uint32_t coeff = sign ? -in[j] : in[j];
+    assert(mtx->sharpen_[j] == 0);
+    if (coeff > mtx->zthresh_[j]) {
+      const uint32_t Q = mtx->q_[j];
+      const uint32_t iQ = mtx->iq_[j];
+      const uint32_t B = mtx->bias_[j];
+      int level = QUANTDIV(coeff, iQ, B);
+      if (level > MAX_LEVEL) level = MAX_LEVEL;
+      if (sign) level = -level;
+      in[j] = level * Q;
+      out[n] = level;
+      if (level) last = n;
+    } else {
+      out[n] = 0;
+      in[j] = 0;
+    }
+  }
+  return (last >= 0);
+}

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Block copy

-static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
  int y;
  for (y = 0; y < size; ++y) {
    memcpy(dst, src, size);
@ -628,20 +664,78 @@ static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
 }

 static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
-static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); }
-static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); }

-// default C implementations
-VP8BlockCopy VP8Copy4x4 = Copy4x4;
-VP8BlockCopy VP8Copy8x8 = Copy8x8;
-VP8BlockCopy VP8Copy16x16 = Copy16x16;
+//------------------------------------------------------------------------------
+// Initialization

-//-----------------------------------------------------------------------------
+// Speed-critical function pointers. We have to initialize them to the default
+// implementations within VP8EncDspInit().
+VP8CHisto VP8CollectHistogram;
+VP8Idct VP8ITransform;
+VP8Fdct VP8FTransform;
+VP8WHT VP8FTransformWHT;
+VP8Intra4Preds VP8EncPredLuma4;
+VP8IntraPreds VP8EncPredLuma16;
+VP8IntraPreds VP8EncPredChroma8;
+VP8Metric VP8SSE16x16;
+VP8Metric VP8SSE8x8;
+VP8Metric VP8SSE16x8;
+VP8Metric VP8SSE4x4;
+VP8WMetric VP8TDisto4x4;
+VP8WMetric VP8TDisto16x16;
+VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
+VP8BlockCopy VP8Copy4x4;
+
+extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitAVX2(void);
+extern void VP8EncDspInitNEON(void);
+extern void VP8EncDspInitMIPS32(void);

 void VP8EncDspInit(void) {
+  VP8DspInit();  // common inverse transforms
  InitTables();
+
+  // default C implementations
+  VP8CollectHistogram = CollectHistogram;
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+  VP8FTransformWHT = FTransformWHT;
+  VP8EncPredLuma4 = Intra4Preds;
+  VP8EncPredLuma16 = Intra16Preds;
+  VP8EncPredChroma8 = IntraChromaPreds;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE4x4 = SSE4x4;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+  VP8Copy4x4 = Copy4x4;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8EncDspInitSSE2();
+    }
+#endif
+#if defined(WEBP_USE_AVX2)
+    if (VP8GetCPUInfo(kAVX2)) {
+      VP8EncDspInitAVX2();
+    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8EncDspInitNEON();
+    }
+#endif
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8EncDspInitMIPS32();
+    }
+#endif
+  }
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dsp/enc_avx2.c
+++ b/src/dsp/enc_avx2.c
@ -0,0 +1,24 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// AVX2 version of speed-critical encoding functions.
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_AVX2)
+
+#endif  // WEBP_USE_AVX2
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitAVX2(void);
+
+void VP8EncDspInitAVX2(void) {
+}
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@ -0,0 +1,767 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of speed-critical encoding functions.
+//
+// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
+//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+//            Slobodan Prijic  (slobodan.prijic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../enc/vp8enci.h"
+#include "../enc/cost.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+// macro for one vertical pass in ITransformOne
+// MUL macro inlined
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to load from in buffer
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+// TEMP4..TEMP5 - temporary registers
+#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
+  "lh      %[temp16],      "#A"(%[temp20])                 \n\t"            \
+  "lh      %[temp18],      "#B"(%[temp20])                 \n\t"            \
+  "lh      %[temp17],      "#C"(%[temp20])                 \n\t"            \
+  "lh      %[temp19],      "#D"(%[temp20])                 \n\t"            \
+  "addu    %["#TEMP4"],    %[temp16],      %[temp18]       \n\t"            \
+  "subu    %[temp16],      %[temp16],      %[temp18]       \n\t"            \
+  "mul     %["#TEMP0"],    %[temp17],      %[kC2]          \n\t"            \
+  "mul     %[temp18],      %[temp19],      %[kC1]          \n\t"            \
+  "mul     %[temp17],      %[temp17],      %[kC1]          \n\t"            \
+  "mul     %[temp19],      %[temp19],      %[kC2]          \n\t"            \
+  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\n"            \
+  "sra     %[temp18],      %[temp18],      16              \n\n"            \
+  "sra     %[temp17],      %[temp17],      16              \n\n"            \
+  "sra     %[temp19],      %[temp19],      16              \n\n"            \
+  "subu    %["#TEMP2"],    %["#TEMP0"],    %[temp18]       \n\t"            \
+  "addu    %["#TEMP3"],    %[temp17],      %[temp19]       \n\t"            \
+  "addu    %["#TEMP0"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"            \
+  "addu    %["#TEMP1"],    %[temp16],      %["#TEMP2"]     \n\t"            \
+  "subu    %["#TEMP2"],    %[temp16],      %["#TEMP2"]     \n\t"            \
+  "subu    %["#TEMP3"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"
+
+// macro for one horizontal pass in ITransformOne
+// MUL and STORE macros inlined
+// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to load from ref and store to dst buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)            \
+  "addiu   %["#TEMP0"],    %["#TEMP0"],    4               \n\t"            \
+  "addu    %[temp16],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
+  "subu    %[temp17],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
+  "mul     %["#TEMP0"],    %["#TEMP4"],    %[kC2]          \n\t"            \
+  "mul     %["#TEMP8"],    %["#TEMP12"],   %[kC1]          \n\t"            \
+  "mul     %["#TEMP4"],    %["#TEMP4"],    %[kC1]          \n\t"            \
+  "mul     %["#TEMP12"],   %["#TEMP12"],   %[kC2]          \n\t"            \
+  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
+  "sra     %["#TEMP8"],    %["#TEMP8"],    16              \n\t"            \
+  "sra     %["#TEMP4"],    %["#TEMP4"],    16              \n\t"            \
+  "sra     %["#TEMP12"],   %["#TEMP12"],   16              \n\t"            \
+  "subu    %[temp18],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
+  "addu    %[temp19],      %["#TEMP4"],    %["#TEMP12"]    \n\t"            \
+  "addu    %["#TEMP0"],    %[temp16],      %[temp19]       \n\t"            \
+  "addu    %["#TEMP4"],    %[temp17],      %[temp18]       \n\t"            \
+  "subu    %["#TEMP8"],    %[temp17],      %[temp18]       \n\t"            \
+  "subu    %["#TEMP12"],   %[temp16],      %[temp19]       \n\t"            \
+  "lw      %[temp20],      0(%[args])                      \n\t"            \
+  "sra     %["#TEMP0"],    %["#TEMP0"],    3               \n\t"            \
+  "sra     %["#TEMP4"],    %["#TEMP4"],    3               \n\t"            \
+  "sra     %["#TEMP8"],    %["#TEMP8"],    3               \n\t"            \
+  "sra     %["#TEMP12"],   %["#TEMP12"],   3               \n\t"            \
+  "lbu     %[temp16],      "#A"(%[temp20])                 \n\t"            \
+  "lbu     %[temp17],      "#B"(%[temp20])                 \n\t"            \
+  "lbu     %[temp18],      "#C"(%[temp20])                 \n\t"            \
+  "lbu     %[temp19],      "#D"(%[temp20])                 \n\t"            \
+  "addu    %["#TEMP0"],    %[temp16],      %["#TEMP0"]     \n\t"            \
+  "addu    %["#TEMP4"],    %[temp17],      %["#TEMP4"]     \n\t"            \
+  "addu    %["#TEMP8"],    %[temp18],      %["#TEMP8"]     \n\t"            \
+  "addu    %["#TEMP12"],   %[temp19],      %["#TEMP12"]    \n\t"            \
+  "slt     %[temp16],      %["#TEMP0"],    $zero           \n\t"            \
+  "slt     %[temp17],      %["#TEMP4"],    $zero           \n\t"            \
+  "slt     %[temp18],      %["#TEMP8"],    $zero           \n\t"            \
+  "slt     %[temp19],      %["#TEMP12"],   $zero           \n\t"            \
+  "movn    %["#TEMP0"],    $zero,          %[temp16]       \n\t"            \
+  "movn    %["#TEMP4"],    $zero,          %[temp17]       \n\t"            \
+  "movn    %["#TEMP8"],    $zero,          %[temp18]       \n\t"            \
+  "movn    %["#TEMP12"],   $zero,          %[temp19]       \n\t"            \
+  "addiu   %[temp20],      $zero,          255             \n\t"            \
+  "slt     %[temp16],      %["#TEMP0"],    %[temp20]       \n\t"            \
+  "slt     %[temp17],      %["#TEMP4"],    %[temp20]       \n\t"            \
+  "slt     %[temp18],      %["#TEMP8"],    %[temp20]       \n\t"            \
+  "slt     %[temp19],      %["#TEMP12"],   %[temp20]       \n\t"            \
+  "movz    %["#TEMP0"],    %[temp20],      %[temp16]       \n\t"            \
+  "movz    %["#TEMP4"],    %[temp20],      %[temp17]       \n\t"            \
+  "lw      %[temp16],      8(%[args])                      \n\t"            \
+  "movz    %["#TEMP8"],    %[temp20],      %[temp18]       \n\t"            \
+  "movz    %["#TEMP12"],   %[temp20],      %[temp19]       \n\t"            \
+  "sb      %["#TEMP0"],    "#A"(%[temp16])                 \n\t"            \
+  "sb      %["#TEMP4"],    "#B"(%[temp16])                 \n\t"            \
+  "sb      %["#TEMP8"],    "#C"(%[temp16])                 \n\t"            \
+  "sb      %["#TEMP12"],   "#D"(%[temp16])                 \n\t"
+
+// Does one or two inverse transforms.
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
+  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
+  const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
+
+  __asm__ volatile(
+    "lw      %[temp20],      4(%[args])                      \n\t"
+    VERTICAL_PASS(0, 16,  8, 24, temp4,  temp0,  temp1,  temp2,  temp3)
+    VERTICAL_PASS(2, 18, 10, 26, temp8,  temp4,  temp5,  temp6,  temp7)
+    VERTICAL_PASS(4, 20, 12, 28, temp12, temp8,  temp9,  temp10, temp11)
+    VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
+
+    HORIZONTAL_PASS( 0,  1,  2,  3, temp0, temp4, temp8,  temp12)
+    HORIZONTAL_PASS(16, 17, 18, 19, temp1, temp5, temp9,  temp13)
+    HORIZONTAL_PASS(32, 33, 34, 35, temp2, temp6, temp10, temp14)
+    HORIZONTAL_PASS(48, 49, 50, 51, temp3, temp7, temp11, temp15)
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+    : [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
+    : "memory", "hi", "lo"
+  );
+}
+
+static void ITransform(const uint8_t* ref, const int16_t* in,
+                       uint8_t* dst, int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);
+  }
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+// macro for one pass through for loop in QuantizeBlock
+// QUANTDIV macro inlined
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+#define QUANTIZE_ONE(J, K, N)                                               \
+  "lh           %[temp0],       "#J"(%[ppin])                       \n\t"   \
+  "lhu          %[temp1],       "#J"(%[ppsharpen])                  \n\t"   \
+  "lw           %[temp2],       "#K"(%[ppzthresh])                  \n\t"   \
+  "sra          %[sign],        %[temp0],           15              \n\t"   \
+  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
+  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
+  "addu         %[coeff],       %[coeff],           %[temp1]        \n\t"   \
+  "slt          %[temp4],       %[temp2],           %[coeff]        \n\t"   \
+  "addiu        %[temp5],       $zero,              0               \n\t"   \
+  "addiu        %[level],       $zero,              0               \n\t"   \
+  "beqz         %[temp4],       2f                                  \n\t"   \
+  "lhu          %[temp1],       "#J"(%[ppiq])                       \n\t"   \
+  "lw           %[temp2],       "#K"(%[ppbias])                     \n\t"   \
+  "lhu          %[temp3],       "#J"(%[ppq])                        \n\t"   \
+  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
+  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
+  "sra          %[level],       %[level],           17              \n\t"   \
+  "slt          %[temp4],       %[max_level],       %[level]        \n\t"   \
+  "movn         %[level],       %[max_level],       %[temp4]        \n\t"   \
+  "xor          %[level],       %[level],           %[sign]         \n\t"   \
+  "subu         %[level],       %[level],           %[sign]         \n\t"   \
+  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
+"2:                                                                 \n\t"   \
+  "sh           %[temp5],       "#J"(%[ppin])                       \n\t"   \
+  "sh           %[level],       "#N"(%[pout])                       \n\t"
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  int sign, coeff, level, i;
+  int max_level = MAX_LEVEL;
+
+  int16_t* ppin             = &in[0];
+  int16_t* pout             = &out[0];
+  const uint16_t* ppsharpen = &mtx->sharpen_[0];
+  const uint32_t* ppzthresh = &mtx->zthresh_[0];
+  const uint16_t* ppq       = &mtx->q_[0];
+  const uint16_t* ppiq      = &mtx->iq_[0];
+  const uint32_t* ppbias    = &mtx->bias_[0];
+
+  __asm__ volatile(
+    QUANTIZE_ONE( 0,  0,  0)
+    QUANTIZE_ONE( 2,  4,  2)
+    QUANTIZE_ONE( 8, 16,  4)
+    QUANTIZE_ONE(16, 32,  6)
+    QUANTIZE_ONE(10, 20,  8)
+    QUANTIZE_ONE( 4,  8, 10)
+    QUANTIZE_ONE( 6, 12, 12)
+    QUANTIZE_ONE(12, 24, 14)
+    QUANTIZE_ONE(18, 36, 16)
+    QUANTIZE_ONE(24, 48, 18)
+    QUANTIZE_ONE(26, 52, 20)
+    QUANTIZE_ONE(20, 40, 22)
+    QUANTIZE_ONE(14, 28, 24)
+    QUANTIZE_ONE(22, 44, 26)
+    QUANTIZE_ONE(28, 56, 28)
+    QUANTIZE_ONE(30, 60, 30)
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+      [level]"=&r"(level)
+    : [pout]"r"(pout), [ppin]"r"(ppin),
+      [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+      [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+      [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+    : "memory", "hi", "lo"
+  );
+
+  // moved out from macro to increase possibility for earlier breaking
+  for (i = 15; i >= 0; i--) {
+    if (out[i]) return 1;
+  }
+  return 0;
+}
+
+#undef QUANTIZE_ONE
+
+// macro for one horizontal pass in Disto4x4 (TTransform)
+// two calls of function TTransform are merged into single one
+// A..D - offsets in bytes to load from a and b buffers
+// E..H - offsets in bytes to store first results to tmp buffer
+// E1..H1 - offsets in bytes to store second results to tmp buffer
+#define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1)   \
+  "lbu    %[temp0],  "#A"(%[a])              \n\t"                \
+  "lbu    %[temp1],  "#B"(%[a])              \n\t"                \
+  "lbu    %[temp2],  "#C"(%[a])              \n\t"                \
+  "lbu    %[temp3],  "#D"(%[a])              \n\t"                \
+  "lbu    %[temp4],  "#A"(%[b])              \n\t"                \
+  "lbu    %[temp5],  "#B"(%[b])              \n\t"                \
+  "lbu    %[temp6],  "#C"(%[b])              \n\t"                \
+  "lbu    %[temp7],  "#D"(%[b])              \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
+  "addu   %[temp2],  %[temp1],    %[temp3]   \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp3]   \n\t"                \
+  "addu   %[temp3],  %[temp4],    %[temp6]   \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp6]   \n\t"                \
+  "addu   %[temp6],  %[temp5],    %[temp7]   \n\t"                \
+  "subu   %[temp5],  %[temp5],    %[temp7]   \n\t"                \
+  "addu   %[temp7],  %[temp8],    %[temp2]   \n\t"                \
+  "subu   %[temp2],  %[temp8],    %[temp2]   \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
+  "addu   %[temp1],  %[temp3],    %[temp6]   \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp6]   \n\t"                \
+  "addu   %[temp6],  %[temp4],    %[temp5]   \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp5]   \n\t"                \
+  "sw     %[temp7],  "#E"(%[tmp])            \n\t"                \
+  "sw     %[temp2],  "#H"(%[tmp])            \n\t"                \
+  "sw     %[temp8],  "#F"(%[tmp])            \n\t"                \
+  "sw     %[temp0],  "#G"(%[tmp])            \n\t"                \
+  "sw     %[temp1],  "#E1"(%[tmp])           \n\t"                \
+  "sw     %[temp3],  "#H1"(%[tmp])           \n\t"                \
+  "sw     %[temp6],  "#F1"(%[tmp])           \n\t"                \
+  "sw     %[temp4],  "#G1"(%[tmp])           \n\t"
+
+// macro for one vertical pass in Disto4x4 (TTransform)
+// two calls of function TTransform are merged into single one
+// since only one accu is available in mips32r1 instruction set
+//   first is done second call of function TTransform and after
+//   that first one.
+//   const int sum1 = TTransform(a, w);
+//   const int sum2 = TTransform(b, w);
+//   return abs(sum2 - sum1) >> 5;
+//   (sum2 - sum1) is calculated with madds (sub2) and msubs (sub1)
+// A..D - offsets in bytes to load first results from tmp buffer
+// A1..D1 - offsets in bytes to load second results from tmp buffer
+// E..H - offsets in bytes to load from w buffer
+#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
+  "lw     %[temp0],  "#A1"(%[tmp])           \n\t"                \
+  "lw     %[temp1],  "#C1"(%[tmp])           \n\t"                \
+  "lw     %[temp2],  "#B1"(%[tmp])           \n\t"                \
+  "lw     %[temp3],  "#D1"(%[tmp])           \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
+  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
+  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
+  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
+  "subu   %[temp8],  %[temp8],    %[temp1]   \n\t"                \
+  "addu   %[temp1],  %[temp0],    %[temp2]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
+  "sra    %[temp4],  %[temp3],    31         \n\t"                \
+  "sra    %[temp5],  %[temp1],    31         \n\t"                \
+  "sra    %[temp6],  %[temp0],    31         \n\t"                \
+  "sra    %[temp7],  %[temp8],    31         \n\t"                \
+  "xor    %[temp3],  %[temp3],    %[temp4]   \n\t"                \
+  "xor    %[temp1],  %[temp1],    %[temp5]   \n\t"                \
+  "xor    %[temp0],  %[temp0],    %[temp6]   \n\t"                \
+  "xor    %[temp8],  %[temp8],    %[temp7]   \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp4]   \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
+  "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
+  "lhu    %[temp4],  "#E"(%[w])              \n\t"                \
+  "lhu    %[temp5],  "#F"(%[w])              \n\t"                \
+  "lhu    %[temp6],  "#G"(%[w])              \n\t"                \
+  "lhu    %[temp7],  "#H"(%[w])              \n\t"                \
+  "madd   %[temp4],  %[temp3]                \n\t"                \
+  "madd   %[temp5],  %[temp1]                \n\t"                \
+  "madd   %[temp6],  %[temp0]                \n\t"                \
+  "madd   %[temp7],  %[temp8]                \n\t"                \
+  "lw     %[temp0],  "#A"(%[tmp])            \n\t"                \
+  "lw     %[temp1],  "#C"(%[tmp])            \n\t"                \
+  "lw     %[temp2],  "#B"(%[tmp])            \n\t"                \
+  "lw     %[temp3],  "#D"(%[tmp])            \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
+  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
+  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
+  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
+  "subu   %[temp1],  %[temp8],    %[temp1]   \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
+  "sra    %[temp2],  %[temp3],    31         \n\t"                \
+  "xor    %[temp3],  %[temp3],    %[temp2]   \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp2]   \n\t"                \
+  "msub   %[temp4],  %[temp3]                \n\t"                \
+  "sra    %[temp2],  %[temp8],    31         \n\t"                \
+  "sra    %[temp3],  %[temp0],    31         \n\t"                \
+  "sra    %[temp4],  %[temp1],    31         \n\t"                \
+  "xor    %[temp8],  %[temp8],    %[temp2]   \n\t"                \
+  "xor    %[temp0],  %[temp0],    %[temp3]   \n\t"                \
+  "xor    %[temp1],  %[temp1],    %[temp4]   \n\t"                \
+  "subu   %[temp8],  %[temp8],    %[temp2]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp3]   \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp4]   \n\t"                \
+  "msub   %[temp5],  %[temp8]                \n\t"                \
+  "msub   %[temp6],  %[temp0]                \n\t"                \
+  "msub   %[temp7],  %[temp1]                \n\t"
+
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  int tmp[32];
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+
+  __asm__ volatile(
+    HORIZONTAL_PASS( 0,  1,  2,  3,    0,  4,  8, 12,    64,  68,  72,  76)
+    HORIZONTAL_PASS(16, 17, 18, 19,   16, 20, 24, 28,    80,  84,  88,  92)
+    HORIZONTAL_PASS(32, 33, 34, 35,   32, 36, 40, 44,    96, 100, 104, 108)
+    HORIZONTAL_PASS(48, 49, 50, 51,   48, 52, 56, 60,   112, 116, 120, 124)
+    "mthi   $zero                             \n\t"
+    "mtlo   $zero                             \n\t"
+    VERTICAL_PASS( 0, 16, 32, 48,     64, 80,  96, 112,   0,  8, 16, 24)
+    VERTICAL_PASS( 4, 20, 36, 52,     68, 84, 100, 116,   2, 10, 18, 26)
+    VERTICAL_PASS( 8, 24, 40, 56,     72, 88, 104, 120,   4, 12, 20, 28)
+    VERTICAL_PASS(12, 28, 44, 60,     76, 92, 108, 124,   6, 14, 22, 30)
+    "mflo   %[temp0]                          \n\t"
+    "sra    %[temp1],  %[temp0],  31          \n\t"
+    "xor    %[temp0],  %[temp0],  %[temp1]    \n\t"
+    "subu   %[temp0],  %[temp0],  %[temp1]    \n\t"
+    "sra    %[temp0],  %[temp0],  5           \n\t"
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+    : [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
+    : "memory", "hi", "lo"
+  );
+
+  return temp0;
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+// macro for one horizontal pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to load from src and ref buffers
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
+  "lw     %["#TEMP1"],  0(%[args])                     \n\t"    \
+  "lw     %["#TEMP2"],  4(%[args])                     \n\t"    \
+  "lbu    %[temp16],    "#A"(%["#TEMP1"])              \n\t"    \
+  "lbu    %[temp17],    "#A"(%["#TEMP2"])              \n\t"    \
+  "lbu    %[temp18],    "#B"(%["#TEMP1"])              \n\t"    \
+  "lbu    %[temp19],    "#B"(%["#TEMP2"])              \n\t"    \
+  "subu   %[temp20],    %[temp16],    %[temp17]        \n\t"    \
+  "lbu    %[temp16],    "#C"(%["#TEMP1"])              \n\t"    \
+  "lbu    %[temp17],    "#C"(%["#TEMP2"])              \n\t"    \
+  "subu   %["#TEMP0"],  %[temp18],    %[temp19]        \n\t"    \
+  "lbu    %[temp18],    "#D"(%["#TEMP1"])              \n\t"    \
+  "lbu    %[temp19],    "#D"(%["#TEMP2"])              \n\t"    \
+  "subu   %["#TEMP1"],  %[temp16],    %[temp17]        \n\t"    \
+  "subu   %["#TEMP2"],  %[temp18],    %[temp19]        \n\t"    \
+  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]      \n\t"    \
+  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]      \n\t"    \
+  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
+  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
+  "mul    %[temp16],    %["#TEMP2"],  %[c5352]         \n\t"    \
+  "mul    %[temp17],    %["#TEMP2"],  %[c2217]         \n\t"    \
+  "mul    %[temp18],    %["#TEMP0"],  %[c5352]         \n\t"    \
+  "mul    %[temp19],    %["#TEMP0"],  %[c2217]         \n\t"    \
+  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]        \n\t"    \
+  "subu   %[temp20],    %["#TEMP3"],  %[temp20]        \n\t"    \
+  "sll    %["#TEMP0"],  %["#TEMP1"],  3                \n\t"    \
+  "sll    %["#TEMP2"],  %[temp20],    3                \n\t"    \
+  "addiu  %[temp16],    %[temp16],    1812             \n\t"    \
+  "addiu  %[temp17],    %[temp17],    937              \n\t"    \
+  "addu   %[temp16],    %[temp16],    %[temp19]        \n\t"    \
+  "subu   %[temp17],    %[temp17],    %[temp18]        \n\t"    \
+  "sra    %["#TEMP1"],  %[temp16],    9                \n\t"    \
+  "sra    %["#TEMP3"],  %[temp17],    9                \n\t"
+
+// macro for one vertical pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to store to out buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)  \
+  "addu   %[temp16],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
+  "subu   %[temp19],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
+  "addu   %[temp17],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
+  "subu   %[temp18],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
+  "mul    %["#TEMP8"],  %[temp19],    %[c2217]         \n\t"    \
+  "mul    %["#TEMP12"], %[temp18],    %[c2217]         \n\t"    \
+  "mul    %["#TEMP4"],  %[temp19],    %[c5352]         \n\t"    \
+  "mul    %[temp18],    %[temp18],    %[c5352]         \n\t"    \
+  "addiu  %[temp16],    %[temp16],    7                \n\t"    \
+  "addu   %["#TEMP0"],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %["#TEMP0"],  %["#TEMP0"],  4                \n\t"    \
+  "addu   %["#TEMP12"], %["#TEMP12"], %["#TEMP4"]      \n\t"    \
+  "subu   %["#TEMP4"],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %["#TEMP4"],  %["#TEMP4"],  4                \n\t"    \
+  "addiu  %["#TEMP8"],  %["#TEMP8"],  30000            \n\t"    \
+  "addiu  %["#TEMP12"], %["#TEMP12"], 12000            \n\t"    \
+  "addiu  %["#TEMP8"],  %["#TEMP8"],  21000            \n\t"    \
+  "subu   %["#TEMP8"],  %["#TEMP8"],  %[temp18]        \n\t"    \
+  "sra    %["#TEMP12"], %["#TEMP12"], 16               \n\t"    \
+  "sra    %["#TEMP8"],  %["#TEMP8"],  16               \n\t"    \
+  "addiu  %[temp16],    %["#TEMP12"], 1                \n\t"    \
+  "movn   %["#TEMP12"], %[temp16],    %[temp19]        \n\t"    \
+  "sh     %["#TEMP0"],  "#A"(%[temp20])                \n\t"    \
+  "sh     %["#TEMP4"],  "#C"(%[temp20])                \n\t"    \
+  "sh     %["#TEMP8"],  "#D"(%[temp20])                \n\t"    \
+  "sh     %["#TEMP12"], "#B"(%[temp20])                \n\t"
+
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+  int temp17, temp18, temp19, temp20;
+  const int c2217 = 2217;
+  const int c5352 = 5352;
+  const int* const args[3] =
+      { (const int*)src, (const int*)ref, (const int*)out };
+
+  __asm__ volatile(
+    HORIZONTAL_PASS( 0,  1,  2,  3, temp0,  temp1,  temp2,  temp3)
+    HORIZONTAL_PASS(16, 17, 18, 19, temp4,  temp5,  temp6,  temp7)
+    HORIZONTAL_PASS(32, 33, 34, 35, temp8,  temp9,  temp10, temp11)
+    HORIZONTAL_PASS(48, 49, 50, 51, temp12, temp13, temp14, temp15)
+    "lw   %[temp20],    8(%[args])                     \n\t"
+    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
+    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
+    VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
+    VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+    : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
+    : "memory", "hi", "lo"
+  );
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+// Forward declaration.
+extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
+
+int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  int p0 = res->prob[n][ctx0][0];
+  const uint16_t* t = res->cost[n][ctx0];
+  int cost;
+  const int const_2 = 2;
+  const int const_255 = 255;
+  const int const_max_level = MAX_VARIABLE_LEVEL;
+  int res_cost;
+  int res_prob;
+  int res_coeffs;
+  int res_last;
+  int v_reg;
+  int b_reg;
+  int ctx_reg;
+  int cost_add, temp_1, temp_2, temp_3;
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+
+  cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+  res_cost = (int)res->cost;
+  res_prob = (int)res->prob;
+  res_coeffs = (int)res->coeffs;
+  res_last = (int)res->last;
+
+  __asm__ volatile(
+    ".set   push                                                           \n\t"
+    ".set   noreorder                                                      \n\t"
+
+    "sll    %[temp_1],     %[n],              1                            \n\t"
+    "addu   %[res_coeffs], %[res_coeffs],     %[temp_1]                    \n\t"
+    "slt    %[temp_2],     %[n],              %[res_last]                  \n\t"
+    "bnez   %[temp_2],     1f                                              \n\t"
+    " li    %[cost_add],   0                                               \n\t"
+    "b      2f                                                             \n\t"
+    " nop                                                                  \n\t"
+  "1:                                                                      \n\t"
+    "lh     %[v_reg],      0(%[res_coeffs])                                \n\t"
+    "addu   %[b_reg],      %[n],              %[VP8EncBands]               \n\t"
+    "move   %[temp_1],     %[const_max_level]                              \n\t"
+    "addu   %[cost],       %[cost],           %[cost_add]                  \n\t"
+    "negu   %[temp_2],     %[v_reg]                                        \n\t"
+    "slti   %[temp_3],     %[v_reg],          0                            \n\t"
+    "movn   %[v_reg],      %[temp_2],         %[temp_3]                    \n\t"
+    "lbu    %[b_reg],      1(%[b_reg])                                     \n\t"
+    "li     %[cost_add],   0                                               \n\t"
+
+    "sltiu  %[temp_3],     %[v_reg],          2                            \n\t"
+    "move   %[ctx_reg],    %[v_reg]                                        \n\t"
+    "movz   %[ctx_reg],    %[const_2],        %[temp_3]                    \n\t"
+    //  cost += VP8LevelCost(t, v);
+    "slt    %[temp_3],     %[v_reg],          %[const_max_level]           \n\t"
+    "movn   %[temp_1],     %[v_reg],          %[temp_3]                    \n\t"
+    "sll    %[temp_2],     %[v_reg],          1                            \n\t"
+    "addu   %[temp_2],     %[temp_2],         %[VP8LevelFixedCosts]        \n\t"
+    "lhu    %[temp_2],     0(%[temp_2])                                    \n\t"
+    "sll    %[temp_1],     %[temp_1],         1                            \n\t"
+    "addu   %[temp_1],     %[temp_1],         %[t]                         \n\t"
+    "lhu    %[temp_3],     0(%[temp_1])                                    \n\t"
+    "addu   %[cost],       %[cost],           %[temp_2]                    \n\t"
+
+    //  t = res->cost[b][ctx];
+    "sll    %[temp_1],     %[ctx_reg],        7                            \n\t"
+    "sll    %[temp_2],     %[ctx_reg],        3                            \n\t"
+    "addu   %[cost],       %[cost],           %[temp_3]                    \n\t"
+    "addu   %[temp_1],     %[temp_1],         %[temp_2]                    \n\t"
+    "sll    %[temp_2],     %[b_reg],          3                            \n\t"
+    "sll    %[temp_3],     %[b_reg],          5                            \n\t"
+    "sub    %[temp_2],     %[temp_3],         %[temp_2]                    \n\t"
+    "sll    %[temp_3],     %[temp_2],         4                            \n\t"
+    "addu   %[temp_1],     %[temp_1],         %[temp_3]                    \n\t"
+    "addu   %[temp_2],     %[temp_2],         %[res_cost]                  \n\t"
+    "addiu  %[n],          %[n],              1                            \n\t"
+    "addu   %[t],          %[temp_1],         %[temp_2]                    \n\t"
+    "slt    %[temp_1],     %[n],              %[res_last]                  \n\t"
+    "bnez   %[temp_1],     1b                                              \n\t"
+    " addiu %[res_coeffs], %[res_coeffs],     2                            \n\t"
+   "2:                                                                     \n\t"
+
+    ".set   pop                                                            \n\t"
+    : [cost]"+r"(cost), [t]"+r"(t), [n]"+r"(n), [v_reg]"=&r"(v_reg),
+      [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [cost_add]"=&r"(cost_add),
+      [temp_1]"=&r"(temp_1), [temp_2]"=&r"(temp_2), [temp_3]"=&r"(temp_3)
+    : [const_2]"r"(const_2), [const_255]"r"(const_255), [res_last]"r"(res_last),
+      [VP8EntropyCost]"r"(VP8EntropyCost), [VP8EncBands]"r"(VP8EncBands),
+      [const_max_level]"r"(const_max_level), [res_prob]"r"(res_prob),
+      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_coeffs]"r"(res_coeffs),
+      [res_cost]"r"(res_cost)
+    : "memory"
+  );
+
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+#define GET_SSE_INNER(A, B, C, D)                               \
+  "lbu     %[temp0],    "#A"(%[a])                   \n\t"      \
+  "lbu     %[temp1],    "#A"(%[b])                   \n\t"      \
+  "lbu     %[temp2],    "#B"(%[a])                   \n\t"      \
+  "lbu     %[temp3],    "#B"(%[b])                   \n\t"      \
+  "lbu     %[temp4],    "#C"(%[a])                   \n\t"      \
+  "lbu     %[temp5],    "#C"(%[b])                   \n\t"      \
+  "lbu     %[temp6],    "#D"(%[a])                   \n\t"      \
+  "lbu     %[temp7],    "#D"(%[b])                   \n\t"      \
+  "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
+  "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
+  "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
+  "subu    %[temp6],    %[temp6],     %[temp7]       \n\t"      \
+  "madd    %[temp0],    %[temp0]                     \n\t"      \
+  "madd    %[temp2],    %[temp2]                     \n\t"      \
+  "madd    %[temp4],    %[temp4]                     \n\t"      \
+  "madd    %[temp6],    %[temp6]                     \n\t"
+
+#define GET_SSE(A, B, C, D)               \
+  GET_SSE_INNER(A, A + 1, A + 2, A + 3)   \
+  GET_SSE_INNER(B, B + 1, B + 2, B + 3)   \
+  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
+  GET_SSE_INNER(D, D + 1, D + 2, D + 3)
+
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+  __asm__ volatile(
+     "mult   $zero,    $zero                            \n\t"
+
+     GET_SSE(  0,   4,   8,  12)
+     GET_SSE( 16,  20,  24,  28)
+     GET_SSE( 32,  36,  40,  44)
+     GET_SSE( 48,  52,  56,  60)
+     GET_SSE( 64,  68,  72,  76)
+     GET_SSE( 80,  84,  88,  92)
+     GET_SSE( 96, 100, 104, 108)
+     GET_SSE(112, 116, 120, 124)
+     GET_SSE(128, 132, 136, 140)
+     GET_SSE(144, 148, 152, 156)
+     GET_SSE(160, 164, 168, 172)
+     GET_SSE(176, 180, 184, 188)
+     GET_SSE(192, 196, 200, 204)
+     GET_SSE(208, 212, 216, 220)
+     GET_SSE(224, 228, 232, 236)
+     GET_SSE(240, 244, 248, 252)
+
+    "mflo    %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi" , "lo"
+  );
+  return count;
+}
+
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+  __asm__ volatile(
+     "mult   $zero,    $zero                            \n\t"
+
+     GET_SSE(  0,   4,   8,  12)
+     GET_SSE( 16,  20,  24,  28)
+     GET_SSE( 32,  36,  40,  44)
+     GET_SSE( 48,  52,  56,  60)
+     GET_SSE( 64,  68,  72,  76)
+     GET_SSE( 80,  84,  88,  92)
+     GET_SSE( 96, 100, 104, 108)
+     GET_SSE(112, 116, 120, 124)
+
+    "mflo    %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi" , "lo"
+  );
+  return count;
+}
+
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+  __asm__ volatile(
+     "mult   $zero,    $zero                            \n\t"
+
+     GET_SSE( 0,   4,  16,  20)
+     GET_SSE(32,  36,  48,  52)
+     GET_SSE(64,  68,  80,  84)
+     GET_SSE(96, 100, 112, 116)
+
+    "mflo    %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi" , "lo"
+  );
+  return count;
+}
+
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+  __asm__ volatile(
+     "mult   $zero,    $zero                            \n\t"
+
+     GET_SSE(0, 16, 32, 48)
+
+    "mflo    %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi" , "lo"
+  );
+  return count;
+}
+
+#undef GET_SSE_MIPS32
+#undef GET_SSE_MIPS32_INNER
+
+#endif  // WEBP_USE_MIPS32
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMIPS32(void);
+
+void VP8EncDspInitMIPS32(void) {
+#if defined(WEBP_USE_MIPS32)
+  VP8ITransform = ITransform;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  VP8FTransform = FTransform;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE4x4 = SSE4x4;
+#endif  // WEBP_USE_MIPS32
+}
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@ -0,0 +1,982 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of speed-critical encoding functions.
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <stdlib.h>  // for abs()
+#include <emmintrin.h>
+
+#include "../enc/cost.h"
+#include "../enc/vp8enci.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// Quite useful macro for debugging. Left here for convenience.
+
+#if 0
+#include <stdio.h>
+static void PrintReg(const __m128i r, const char* const name, int size) {
+  int n;
+  union {
+    __m128i r;
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i32[4];
+    uint64_t i64[2];
+  } tmp;
+  tmp.r = r;
+  printf("%s\t: ", name);
+  if (size == 8) {
+    for (n = 0; n < 16; ++n) printf("%.2x ", tmp.i8[n]);
+  } else if (size == 16) {
+    for (n = 0; n < 8; ++n) printf("%.4x ", tmp.i16[n]);
+  } else if (size == 32) {
+    for (n = 0; n < 4; ++n) printf("%.8x ", tmp.i32[n]);
+  } else {
+    for (n = 0; n < 2; ++n) printf("%.16lx ", tmp.i64[n]);
+  }
+  printf("\n");
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
+  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+  int j;
+  for (j = start_block; j < end_block; ++j) {
+    int16_t out[16];
+    int k;
+
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to bin (within out[]).
+    {
+      // Load.
+      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+      // sign(out) = out >> 15  (0x0000 if positive, 0xffff if negative)
+      const __m128i sign0 = _mm_srai_epi16(out0, 15);
+      const __m128i sign1 = _mm_srai_epi16(out1, 15);
+      // abs(out) = (out ^ sign) - sign
+      const __m128i xor0 = _mm_xor_si128(out0, sign0);
+      const __m128i xor1 = _mm_xor_si128(out1, sign1);
+      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
+      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
+      // v = abs(out) >> 3
+      const __m128i v0 = _mm_srai_epi16(abs0, 3);
+      const __m128i v1 = _mm_srai_epi16(abs1, 3);
+      // bin = min(v, MAX_COEFF_THRESH)
+      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+      // Store.
+      _mm_storeu_si128((__m128i*)&out[0], bin0);
+      _mm_storeu_si128((__m128i*)&out[8], bin1);
+    }
+
+    // Convert coefficients to bin.
+    for (k = 0; k < 16; ++k) {
+      histo->distribution[out[k]]++;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Does one or two inverse transforms.
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
+  // This implementation makes use of 16-bit fixed point versions of two
+  // multiply constants:
+  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+  //
+  // To be able to use signed 16-bit integers, we use the following trick to
+  // have constants within range:
+  // - Associated constants are obtained by subtracting the 16-bit fixed point
+  //   version of one:
+  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
+  //      K1 = 85267  =>  k1 =  20091
+  //      K2 = 35468  =>  k2 = -30068
+  // - The multiplication of a variable by a constant become the sum of the
+  //   variable and the multiplication of that variable by the associated
+  //   constant:
+  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+  const __m128i k1 = _mm_set1_epi16(20091);
+  const __m128i k2 = _mm_set1_epi16(-30068);
+  __m128i T0, T1, T2, T3;
+
+  // Load and concatenate the transform coefficients (we'll do two inverse
+  // transforms in parallel). In the case of only one inverse transform, the
+  // second half of the vectors will just contain random value we'll never
+  // use nor store.
+  __m128i in0, in1, in2, in3;
+  {
+    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
+    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
+    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
+    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+    // a00 a10 a20 a30   x x x x
+    // a01 a11 a21 a31   x x x x
+    // a02 a12 a22 a32   x x x x
+    // a03 a13 a23 a33   x x x x
+    if (do_two) {
+      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+      in0 = _mm_unpacklo_epi64(in0, inB0);
+      in1 = _mm_unpacklo_epi64(in1, inB1);
+      in2 = _mm_unpacklo_epi64(in2, inB2);
+      in3 = _mm_unpacklo_epi64(in3, inB3);
+      // a00 a10 a20 a30   b00 b10 b20 b30
+      // a01 a11 a21 a31   b01 b11 b21 b31
+      // a02 a12 a22 a32   b02 b12 b22 b32
+      // a03 a13 a23 a33   b03 b13 b23 b33
+    }
+  }
+
+  // Vertical pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i a = _mm_add_epi16(in0, in2);
+    const __m128i b = _mm_sub_epi16(in0, in2);
+    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
+    const __m128i c3 = _mm_sub_epi16(in1, in3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
+    const __m128i d3 = _mm_add_epi16(in1, in3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 a22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i four = _mm_set1_epi16(4);
+    const __m128i dc = _mm_add_epi16(T0, four);
+    const __m128i a =  _mm_add_epi16(dc, T2);
+    const __m128i b =  _mm_sub_epi16(dc, T2);
+    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
+    const __m128i c3 = _mm_sub_epi16(T1, T3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
+    const __m128i d3 = _mm_add_epi16(T1, T3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
+    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
+    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
+    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 a22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Add inverse transform to 'ref' and store.
+  {
+    const __m128i zero = _mm_setzero_si128();
+    // Load the reference(s).
+    __m128i ref0, ref1, ref2, ref3;
+    if (do_two) {
+      // Load eight bytes/pixels per line.
+      ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
+      ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
+      ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
+      ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
+    } else {
+      // Load four bytes/pixels per line.
+      ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]);
+      ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]);
+      ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]);
+      ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]);
+    }
+    // Convert to 16b.
+    ref0 = _mm_unpacklo_epi8(ref0, zero);
+    ref1 = _mm_unpacklo_epi8(ref1, zero);
+    ref2 = _mm_unpacklo_epi8(ref2, zero);
+    ref3 = _mm_unpacklo_epi8(ref3, zero);
+    // Add the inverse transform(s).
+    ref0 = _mm_add_epi16(ref0, T0);
+    ref1 = _mm_add_epi16(ref1, T1);
+    ref2 = _mm_add_epi16(ref2, T2);
+    ref3 = _mm_add_epi16(ref3, T3);
+    // Unsigned saturate to 8b.
+    ref0 = _mm_packus_epi16(ref0, ref0);
+    ref1 = _mm_packus_epi16(ref1, ref1);
+    ref2 = _mm_packus_epi16(ref2, ref2);
+    ref3 = _mm_packus_epi16(ref3, ref3);
+    // Store the results.
+    if (do_two) {
+      // Store eight bytes/pixels per line.
+      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
+      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
+      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
+      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
+    } else {
+      // Store four bytes/pixels per line.
+      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
+      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
+      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
+      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
+    }
+  }
+}
+
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i seven = _mm_set1_epi16(7);
+  const __m128i k937 = _mm_set1_epi32(937);
+  const __m128i k1812 = _mm_set1_epi32(1812);
+  const __m128i k51000 = _mm_set1_epi32(51000);
+  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
+  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
+                                           5352,  2217, 5352,  2217);
+  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
+                                           2217, -5352, 2217, -5352);
+  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
+  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
+  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
+                                            2217, 5352, 2217, 5352);
+  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
+                                            -5352, 2217, -5352, 2217);
+  __m128i v01, v32;
+
+
+  // Difference between src and ref and initial transpose.
+  {
+    // Load src and convert to 16b.
+    const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
+    const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
+    const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
+    const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
+    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+    // Load ref and convert to 16b.
+    const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
+    const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
+    const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
+    const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
+    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+    // Compute difference. -> 00 01 02 03 00 00 00 00
+    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+
+    // Unpack and shuffle
+    // 00 01 02 03   0 0 0 0
+    // 10 11 12 13   0 0 0 0
+    // 20 21 22 23   0 0 0 0
+    // 30 31 32 33   0 0 0 0
+    const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
+    const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
+    // 00 01 10 11 02 03 12 13
+    // 20 21 30 31 22 23 32 33
+    const __m128i shuf01_p =
+        _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i shuf23_p =
+        _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
+    // 00 01 10 11 03 02 13 12
+    // 20 21 30 31 23 22 33 32
+    const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
+    const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
+    // 00 01 10 11 20 21 30 31
+    // 03 02 13 12 23 22 33 32
+    const __m128i a01 = _mm_add_epi16(s01, s32);
+    const __m128i a32 = _mm_sub_epi16(s01, s32);
+    // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+    // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
+
+    const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
+    const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
+    const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
+    const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
+    const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
+    const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
+    const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
+    const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
+    const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
+    const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
+    const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
+    const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
+    const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
+    v01 = _mm_unpacklo_epi32(s_lo, s_hi);
+    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
+  }
+
+  // Second pass
+  {
+    // Same operations are done on the (0,3) and (1,2) pairs.
+    // a0 = v0 + v3
+    // a1 = v1 + v2
+    // a3 = v0 - v3
+    // a2 = v1 - v2
+    const __m128i a01 = _mm_add_epi16(v01, v32);
+    const __m128i a32 = _mm_sub_epi16(v01, v32);
+    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+    const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
+
+    // d0 = (a0 + a1 + 7) >> 4;
+    // d2 = (a0 - a1 + 7) >> 4;
+    const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+    const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
+    const __m128i d0 = _mm_srai_epi16(c0, 4);
+    const __m128i d2 = _mm_srai_epi16(c2, 4);
+
+    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
+    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
+    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
+    const __m128i d3 = _mm_add_epi32(c3, k51000);
+    const __m128i e1 = _mm_srai_epi32(d1, 16);
+    const __m128i e3 = _mm_srai_epi32(d3, 16);
+    const __m128i f1 = _mm_packs_epi32(e1, e1);
+    const __m128i f3 = _mm_packs_epi32(e3, e3);
+    // f1 = f1 + (a3 != 0);
+    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+    // desired (0, 1), we add one earlier through k12000_plus_one.
+    // -> f1 = f1 + 1 - (a3 == 0)
+    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+
+    const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
+    const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
+    _mm_storeu_si128((__m128i*)&out[0], d0_g1);
+    _mm_storeu_si128((__m128i*)&out[8], d2_f3);
+  }
+}
+
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+  int32_t tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i, in += 64) {
+    const int a0 = (in[0 * 16] + in[2 * 16]);
+    const int a1 = (in[1 * 16] + in[3 * 16]);
+    const int a2 = (in[1 * 16] - in[3 * 16]);
+    const int a3 = (in[0 * 16] - in[2 * 16]);
+    tmp[0 + i * 4] = a0 + a1;
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+  {
+    const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
+    const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
+    const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
+    const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
+    const __m128i a0 = _mm_add_epi32(src0, src2);
+    const __m128i a1 = _mm_add_epi32(src1, src3);
+    const __m128i a2 = _mm_sub_epi32(src1, src3);
+    const __m128i a3 = _mm_sub_epi32(src0, src2);
+    const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
+    const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
+    const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
+    const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
+    const __m128i out0 = _mm_packs_epi32(b0, b1);
+    const __m128i out1 = _mm_packs_epi32(b2, b3);
+    _mm_storeu_si128((__m128i*)&out[0], out0);
+    _mm_storeu_si128((__m128i*)&out[8], out1);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+static int SSE_Nx4(const uint8_t* a, const uint8_t* b,
+                   int num_quads, int do_16) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum1 = zero;
+  __m128i sum2 = zero;
+
+  while (num_quads-- > 0) {
+    // Note: for the !do_16 case, we read 16 pixels instead of 8 but that's ok,
+    // thanks to buffer over-allocation to that effect.
+    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[BPS * 0]);
+    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[BPS * 1]);
+    const __m128i a2 = _mm_loadu_si128((__m128i*)&a[BPS * 2]);
+    const __m128i a3 = _mm_loadu_si128((__m128i*)&a[BPS * 3]);
+    const __m128i b0 = _mm_loadu_si128((__m128i*)&b[BPS * 0]);
+    const __m128i b1 = _mm_loadu_si128((__m128i*)&b[BPS * 1]);
+    const __m128i b2 = _mm_loadu_si128((__m128i*)&b[BPS * 2]);
+    const __m128i b3 = _mm_loadu_si128((__m128i*)&b[BPS * 3]);
+
+    // compute clip0(a-b) and clip0(b-a)
+    const __m128i a0p = _mm_subs_epu8(a0, b0);
+    const __m128i a0m = _mm_subs_epu8(b0, a0);
+    const __m128i a1p = _mm_subs_epu8(a1, b1);
+    const __m128i a1m = _mm_subs_epu8(b1, a1);
+    const __m128i a2p = _mm_subs_epu8(a2, b2);
+    const __m128i a2m = _mm_subs_epu8(b2, a2);
+    const __m128i a3p = _mm_subs_epu8(a3, b3);
+    const __m128i a3m = _mm_subs_epu8(b3, a3);
+
+    // compute |a-b| with 8b arithmetic as clip0(a-b) | clip0(b-a)
+    const __m128i diff0 = _mm_or_si128(a0p, a0m);
+    const __m128i diff1 = _mm_or_si128(a1p, a1m);
+    const __m128i diff2 = _mm_or_si128(a2p, a2m);
+    const __m128i diff3 = _mm_or_si128(a3p, a3m);
+
+    // unpack (only four operations, instead of eight)
+    const __m128i low0 = _mm_unpacklo_epi8(diff0, zero);
+    const __m128i low1 = _mm_unpacklo_epi8(diff1, zero);
+    const __m128i low2 = _mm_unpacklo_epi8(diff2, zero);
+    const __m128i low3 = _mm_unpacklo_epi8(diff3, zero);
+
+    // multiply with self
+    const __m128i low_madd0 = _mm_madd_epi16(low0, low0);
+    const __m128i low_madd1 = _mm_madd_epi16(low1, low1);
+    const __m128i low_madd2 = _mm_madd_epi16(low2, low2);
+    const __m128i low_madd3 = _mm_madd_epi16(low3, low3);
+
+    // collect in a cascading way
+    const __m128i low_sum0 = _mm_add_epi32(low_madd0, low_madd1);
+    const __m128i low_sum1 = _mm_add_epi32(low_madd2, low_madd3);
+    sum1 = _mm_add_epi32(sum1, low_sum0);
+    sum2 = _mm_add_epi32(sum2, low_sum1);
+
+    if (do_16) {  // if necessary, process the higher 8 bytes similarly
+      const __m128i hi0 = _mm_unpackhi_epi8(diff0, zero);
+      const __m128i hi1 = _mm_unpackhi_epi8(diff1, zero);
+      const __m128i hi2 = _mm_unpackhi_epi8(diff2, zero);
+      const __m128i hi3 = _mm_unpackhi_epi8(diff3, zero);
+
+      const __m128i hi_madd0 = _mm_madd_epi16(hi0, hi0);
+      const __m128i hi_madd1 = _mm_madd_epi16(hi1, hi1);
+      const __m128i hi_madd2 = _mm_madd_epi16(hi2, hi2);
+      const __m128i hi_madd3 = _mm_madd_epi16(hi3, hi3);
+      const __m128i hi_sum0 = _mm_add_epi32(hi_madd0, hi_madd1);
+      const __m128i hi_sum1 = _mm_add_epi32(hi_madd2, hi_madd3);
+      sum1 = _mm_add_epi32(sum1, hi_sum0);
+      sum2 = _mm_add_epi32(sum2, hi_sum1);
+    }
+    a += 4 * BPS;
+    b += 4 * BPS;
+  }
+  {
+    int32_t tmp[4];
+    const __m128i sum = _mm_add_epi32(sum1, sum2);
+    _mm_storeu_si128((__m128i*)tmp, sum);
+    return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+  }
+}
+
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+  return SSE_Nx4(a, b, 4, 1);
+}
+
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+  return SSE_Nx4(a, b, 2, 1);
+}
+
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+  return SSE_Nx4(a, b, 2, 0);
+}
+
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+  const __m128i zero = _mm_setzero_si128();
+
+  // Load values. Note that we read 8 pixels instead of 4,
+  // but the a/b buffers are over-allocated to that effect.
+  const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
+  const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
+  const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
+  const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
+  const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
+  const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
+  const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
+  const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);
+
+  // Combine pair of lines and convert to 16b.
+  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
+  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
+  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
+  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
+  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
+  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
+
+  // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
+  // TODO(cduvivier): Dissassemble and figure out why this is fastest. We don't
+  //                  need absolute values, there is no need to do calculation
+  //                  in 8bit as we are already in 16bit, ... Yet this is what
+  //                  benchmarks the fastest!
+  const __m128i d0 = _mm_subs_epu8(a01s, b01s);
+  const __m128i d1 = _mm_subs_epu8(b01s, a01s);
+  const __m128i d2 = _mm_subs_epu8(a23s, b23s);
+  const __m128i d3 = _mm_subs_epu8(b23s, a23s);
+
+  // Square and add them all together.
+  const __m128i madd0 = _mm_madd_epi16(d0, d0);
+  const __m128i madd1 = _mm_madd_epi16(d1, d1);
+  const __m128i madd2 = _mm_madd_epi16(d2, d2);
+  const __m128i madd3 = _mm_madd_epi16(d3, d3);
+  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
+  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
+  const __m128i sum2 = _mm_add_epi32(sum0, sum1);
+
+  int32_t tmp[4];
+  _mm_storeu_si128((__m128i*)tmp, sum2);
+  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the difference between the weighted sum of the absolute value of
+// transformed coefficients.
+static int TTransform(const uint8_t* inA, const uint8_t* inB,
+                      const uint16_t* const w) {
+  int32_t sum[4];
+  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Load, combine and transpose inputs.
+  {
+    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
+    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
+    const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
+    const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
+    const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
+    const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
+    const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
+    const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);
+
+    // Combine inA and inB (we'll do two transforms in parallel).
+    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
+    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
+    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
+    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
+    // a00 b00 a01 b01 a02 b03 a03 b03   0 0 0 0 0 0 0 0
+    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
+    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
+    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0
+
+    // Transpose the two 4x4, discarding the filling zeroes.
+    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
+    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
+    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
+    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
+    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33
+
+    // Convert to 16b.
+    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
+    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
+    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
+    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+
+    // Transpose the two 4x4.
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 a22 b32 b03 b13 b23 b33
+    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Vertical pass and difference of weighted sums.
+  {
+    // Load all inputs.
+    // TODO(cduvivier): Make variable declarations and allocations aligned so
+    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
+    const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
+    const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);
+
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+    // Separate the transforms of inA and inB.
+    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+    {
+      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
+      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
+      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
+      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
+      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);
+
+      // b = abs(b) = (b ^ sign) - sign
+      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
+      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
+      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
+      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
+      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
+      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
+      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
+      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
+    }
+
+    // weighted sums
+    A_b0 = _mm_madd_epi16(A_b0, w_0);
+    A_b2 = _mm_madd_epi16(A_b2, w_8);
+    B_b0 = _mm_madd_epi16(B_b0, w_0);
+    B_b2 = _mm_madd_epi16(B_b2, w_8);
+    A_b0 = _mm_add_epi32(A_b0, A_b2);
+    B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+    // difference of weighted sums
+    A_b0 = _mm_sub_epi32(A_b0, B_b0);
+    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
+  }
+  return sum[0] + sum[1] + sum[2] + sum[3];
+}
+
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int diff_sum = TTransform(a, b, w);
+  return abs(diff_sum) >> 5;
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
+                                       const uint16_t* const sharpen,
+                                       const VP8Matrix* const mtx) {
+  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i coeff0, coeff8;
+  __m128i out0, out8;
+  __m128i packed_out;
+
+  // Load all inputs.
+  // TODO(cduvivier): Make variable declarations and allocations aligned so that
+  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
+  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
+  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
+  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
+  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
+
+  // extract sign(in)  (0x0000 if positive, 0xffff if negative)
+  const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
+  const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
+
+  // coeff = abs(in) = (in ^ sign) - sign
+  coeff0 = _mm_xor_si128(in0, sign0);
+  coeff8 = _mm_xor_si128(in8, sign8);
+  coeff0 = _mm_sub_epi16(coeff0, sign0);
+  coeff8 = _mm_sub_epi16(coeff8, sign8);
+
+  // coeff = abs(in) + sharpen
+  if (sharpen != NULL) {
+    const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&sharpen[0]);
+    const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&sharpen[8]);
+    coeff0 = _mm_add_epi16(coeff0, sharpen0);
+    coeff8 = _mm_add_epi16(coeff8, sharpen8);
+  }
+
+  // out = (coeff * iQ + B) >> QFIX
+  {
+    // doing calculations with 32b precision (QFIX=17)
+    // out = (coeff * iQ)
+    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+    // out = (coeff * iQ + B)
+    const __m128i bias_00 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
+    const __m128i bias_04 = _mm_loadu_si128((__m128i*)&mtx->bias_[4]);
+    const __m128i bias_08 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
+    const __m128i bias_12 = _mm_loadu_si128((__m128i*)&mtx->bias_[12]);
+    out_00 = _mm_add_epi32(out_00, bias_00);
+    out_04 = _mm_add_epi32(out_04, bias_04);
+    out_08 = _mm_add_epi32(out_08, bias_08);
+    out_12 = _mm_add_epi32(out_12, bias_12);
+    // out = QUANTDIV(coeff, iQ, B, QFIX)
+    out_00 = _mm_srai_epi32(out_00, QFIX);
+    out_04 = _mm_srai_epi32(out_04, QFIX);
+    out_08 = _mm_srai_epi32(out_08, QFIX);
+    out_12 = _mm_srai_epi32(out_12, QFIX);
+
+    // pack result as 16b
+    out0 = _mm_packs_epi32(out_00, out_04);
+    out8 = _mm_packs_epi32(out_08, out_12);
+
+    // if (coeff > 2047) coeff = 2047
+    out0 = _mm_min_epi16(out0, max_coeff_2047);
+    out8 = _mm_min_epi16(out8, max_coeff_2047);
+  }
+
+  // get sign back (if (sign[j]) out_n = -out_n)
+  out0 = _mm_xor_si128(out0, sign0);
+  out8 = _mm_xor_si128(out8, sign8);
+  out0 = _mm_sub_epi16(out0, sign0);
+  out8 = _mm_sub_epi16(out8, sign8);
+
+  // in = out * Q
+  in0 = _mm_mullo_epi16(out0, q0);
+  in8 = _mm_mullo_epi16(out8, q8);
+
+  _mm_storeu_si128((__m128i*)&in[0], in0);
+  _mm_storeu_si128((__m128i*)&in[8], in8);
+
+  // zigzag the output before storing it.
+  //
+  // The zigzag pattern can almost be reproduced with a small sequence of
+  // shuffles. After it, we only need to swap the 7th (ending up in third
+  // position instead of twelfth) and 8th values.
+  {
+    __m128i outZ0, outZ8;
+    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
+    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
+    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
+    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
+    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
+    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
+    _mm_storeu_si128((__m128i*)&out[0], outZ0);
+    _mm_storeu_si128((__m128i*)&out[8], outZ8);
+    packed_out = _mm_packs_epi16(outZ0, outZ8);
+  }
+  {
+    const int16_t outZ_12 = out[12];
+    const int16_t outZ_3 = out[3];
+    out[3] = outZ_12;
+    out[12] = outZ_3;
+  }
+
+  // detect if all 'out' values are zeroes or not
+  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+                            const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, NULL, mtx);
+}
+
+// Forward declaration.
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                              VP8Residual* const res);
+
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                              VP8Residual* const res) {
+  const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
+  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+  // Use SSE to compare 8 values with a single instruction.
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
+  const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
+  // Get the comparison results as a bitmask, consisting of two times 16 bits:
+  // two identical bits for each result. Concatenate both bitmasks to get a
+  // single 32 bit value. Negate the mask to get the position of entries that
+  // are not equal to zero. We don't need to mask out least significant bits
+  // according to res->first, since coeffs[0] is 0 if res->first > 0
+  const uint32_t mask =
+      ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
+  // The position of the most significant non-zero bit indicates the position of
+  // the last non-zero value. Divide the result by two because __movemask_epi8
+  // operates on 8 bit values instead of 16 bit values.
+  assert(res->first == 0 || coeffs[0] == 0);
+  res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
+  res->coeffs = coeffs;
+}
+
+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitSSE2(void);
+
+void VP8EncDspInitSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  VP8CollectHistogram = CollectHistogram;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+  VP8FTransformWHT = FTransformWHT;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE4x4 = SSE4x4;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+#endif   // WEBP_USE_SSE2
+}
+
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -0,0 +1,249 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+
+#ifndef WEBP_DSP_LOSSLESS_H_
+#define WEBP_DSP_LOSSLESS_H_
+
+#include "../webp/types.h"
+#include "../webp/decode.h"
+
+#include "../enc/histogram.h"
+#include "../utils/utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Signatures and generic function-pointers
+
+typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+extern VP8LPredictorFunc VP8LPredictors[16];
+
+typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
+extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+
+typedef struct {
+  // Note: the members are uint8_t, so that any negative values are
+  // automatically converted to "mod 256" values.
+  uint8_t green_to_red_;
+  uint8_t green_to_blue_;
+  uint8_t red_to_blue_;
+} VP8LMultipliers;
+typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
+                                       uint32_t* argb_data, int num_pixels);
+extern VP8LTransformColorFunc VP8LTransformColor;
+extern VP8LTransformColorFunc VP8LTransformColorInverse;
+
+typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
+                                uint8_t* dst);
+extern VP8LConvertFunc VP8LConvertBGRAToRGB;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
+extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
+extern VP8LConvertFunc VP8LConvertBGRAToBGR;
+
+// Expose some C-only fallback functions
+void VP8LTransformColor_C(const VP8LMultipliers* const m,
+                          uint32_t* data, int num_pixels);
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
+                                 uint32_t* data, int num_pixels);
+
+void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+                               int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
+
+// Must be called before calling any of the above methods.
+void VP8LDspInit(void);
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+struct VP8LTransform;  // Defined in dec/vp8li.h.
+
+// Performs inverse transform of data given transform information, start and end
+// rows. Transform will be applied to rows [row_start, row_end[.
+// The *in and *out pointers refer to source and destination data respectively
+// corresponding to the intermediate row (row_start).
+void VP8LInverseTransform(const struct VP8LTransform* const transform,
+                          int row_start, int row_end,
+                          const uint32_t* const in, uint32_t* const out);
+
+// Similar to the static method ColorIndexInverseTransform() that is part of
+// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
+// uint32_t) arguments for 'src' and 'dst'.
+void VP8LColorIndexInverseTransformAlpha(
+    const struct VP8LTransform* const transform, int y_start, int y_end,
+    const uint8_t* src, uint8_t* dst);
+
+void VP8LResidualImage(int width, int height, int bits,
+                       uint32_t* const argb, uint32_t* const argb_scratch,
+                       uint32_t* const image);
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                             uint32_t* const argb, uint32_t* image);
+
+//------------------------------------------------------------------------------
+// Color space conversion.
+
+// Converts from BGRA to other color spaces.
+void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
+                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
+
+//------------------------------------------------------------------------------
+// Misc methods.
+
+// Computes sampled size of 'size' when sampling using 'sampling bits'.
+static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
+                                              uint32_t sampling_bits) {
+  return (size + (1 << sampling_bits) - 1) >> sampling_bits;
+}
+
+// -----------------------------------------------------------------------------
+// Faster logarithm for integers. Small values use a look-up table.
+#define LOG_LOOKUP_IDX_MAX 256
+extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
+extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
+typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
+
+extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
+}
+// Fast calculation of v * log2(v) for integer input.
+static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
+}
+
+// -----------------------------------------------------------------------------
+// Huffman-cost related functions.
+
+typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
+typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+                                       int length);
+
+extern VP8LCostFunc VP8LExtraCost;
+extern VP8LCostCombinedFunc VP8LExtraCostCombined;
+
+typedef struct {        // small struct to hold counters
+  int counts[2];        // index: 0=zero steak, 1=non-zero streak
+  int streaks[2][2];    // [zero/non-zero][streak<3 / streak>=3]
+} VP8LStreaks;
+
+typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population,
+                                         int length);
+typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
+                                                 const uint32_t* Y, int length);
+
+extern VP8LCostCountFunc VP8LHuffmanCostCount;
+extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
+
+typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a,
+                                     const VP8LHistogram* const b,
+                                     VP8LHistogram* const out);
+extern VP8LHistogramAddFunc VP8LHistogramAdd;
+
+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  const int log_floor = BitsLog2Floor(n);
+  if (n == (n & ~(n - 1)))  // zero or a power of two.
+    return log_floor;
+  else
+    return log_floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
+                                                  int* const extra_bits) {
+  const int highest_bit = BitsLog2Floor(--distance);
+  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+  *extra_bits = highest_bit - 1;
+  *code = 2 * highest_bit + second_highest_bit;
+}
+
+static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
+                                              int* const extra_bits,
+                                              int* const extra_bits_value) {
+  const int highest_bit = BitsLog2Floor(--distance);
+  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+  *extra_bits = highest_bit - 1;
+  *extra_bits_value = distance & ((1 << *extra_bits) - 1);
+  *code = 2 * highest_bit + second_highest_bit;
+}
+
+#define PREFIX_LOOKUP_IDX_MAX   512
+typedef struct {
+  int8_t code_;
+  int8_t extra_bits_;
+} VP8LPrefixCode;
+
+// These tables are derived using VP8LPrefixEncodeNoLUT.
+extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
+extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
+static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
+                                             int* const extra_bits) {
+  if (distance < PREFIX_LOOKUP_IDX_MAX) {
+    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+    *code = prefix_code.code_;
+    *extra_bits = prefix_code.extra_bits_;
+  } else {
+    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
+  }
+}
+
+static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
+                                         int* const extra_bits,
+                                         int* const extra_bits_value) {
+  if (distance < PREFIX_LOOKUP_IDX_MAX) {
+    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+    *code = prefix_code.code_;
+    *extra_bits = prefix_code.extra_bits_;
+    *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
+  } else {
+    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
+  }
+}
+
+// In-place difference of each component with mod 256.
+static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
+  const uint32_t alpha_and_green =
+      0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
+  const uint32_t red_and_blue =
+      0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
+  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+void VP8LBundleColorMap(const uint8_t* const row, int width,
+                        int xbits, uint32_t* const dst);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_LOSSLESS_H_
--- a/src/dsp/lossless_mips32.c
+++ b/src/dsp/lossless_mips32.c
@ -0,0 +1,416 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of lossless functions
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+#include "./lossless.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define APPROX_LOG_WITH_CORRECTION_MAX  65536
+#define APPROX_LOG_MAX                   4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+
+static float FastSLog2Slow(uint32_t v) {
+  assert(v >= LOG_LOOKUP_IDX_MAX);
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+    uint32_t log_cnt, y, correction;
+    const int c24 = 24;
+    const float v_f = (float)v;
+    uint32_t temp;
+
+    // Xf = 256 = 2^8
+    // log_cnt is index of leading one in upper 24 bits
+    __asm__ volatile(
+      "clz      %[log_cnt], %[v]                      \n\t"
+      "addiu    %[y],       $zero,        1           \n\t"
+      "subu     %[log_cnt], %[c24],       %[log_cnt]  \n\t"
+      "sllv     %[y],       %[y],         %[log_cnt]  \n\t"
+      "srlv     %[temp],    %[v],         %[log_cnt]  \n\t"
+      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+        [temp]"=r"(temp)
+      : [c24]"r"(c24), [v]"r"(v)
+    );
+
+    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+    // Xf = floor(Xf) * (1 + (v % y) / v)
+    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+    // The correction factor: log(1 + d) ~ d; for very small d values, so
+    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+    // LOG_2_RECIPROCAL ~ 23/16
+
+    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
+    correction = (23 * (v & (y - 1))) >> 4;
+    return v_f * (kLog2Table[temp] + log_cnt) + correction;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+  }
+}
+
+static float FastLog2Slow(uint32_t v) {
+  assert(v >= LOG_LOOKUP_IDX_MAX);
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+    uint32_t log_cnt, y;
+    const int c24 = 24;
+    double log_2;
+    uint32_t temp;
+
+    __asm__ volatile(
+      "clz      %[log_cnt], %[v]                      \n\t"
+      "addiu    %[y],       $zero,        1           \n\t"
+      "subu     %[log_cnt], %[c24],       %[log_cnt]  \n\t"
+      "sllv     %[y],       %[y],         %[log_cnt]  \n\t"
+      "srlv     %[temp],    %[v],         %[log_cnt]  \n\t"
+      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+        [temp]"=r"(temp)
+      : [c24]"r"(c24), [v]"r"(v)
+    );
+
+    log_2 = kLog2Table[temp] + log_cnt;
+    if (v >= APPROX_LOG_MAX) {
+      // Since the division is still expensive, add this correction factor only
+      // for large values of 'v'.
+
+      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
+      log_2 += (double)correction / v;
+    }
+    return (float)log_2;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * log((double)v));
+  }
+}
+
+// C version of this function:
+//   int i = 0;
+//   int64_t cost = 0;
+//   const uint32_t* pop = &population[4];
+//   const uint32_t* LoopEnd = &population[length];
+//   while (pop != LoopEnd) {
+//     ++i;
+//     cost += i * *pop;
+//     cost += i * *(pop + 1);
+//     pop += 2;
+//   }
+//   return (double)cost;
+static double ExtraCost(const uint32_t* const population, int length) {
+  int i, temp0, temp1;
+  const uint32_t* pop = &population[4];
+  const uint32_t* const LoopEnd = &population[length];
+
+  __asm__ volatile(
+    "mult   $zero,    $zero                  \n\t"
+    "xor    %[i],     %[i],       %[i]       \n\t"
+    "beq    %[pop],   %[LoopEnd], 2f         \n\t"
+  "1:                                        \n\t"
+    "lw     %[temp0], 0(%[pop])              \n\t"
+    "lw     %[temp1], 4(%[pop])              \n\t"
+    "addiu  %[i],     %[i],       1          \n\t"
+    "addiu  %[pop],   %[pop],     8          \n\t"
+    "madd   %[i],     %[temp0]               \n\t"
+    "madd   %[i],     %[temp1]               \n\t"
+    "bne    %[pop],   %[LoopEnd], 1b         \n\t"
+  "2:                                        \n\t"
+    "mfhi   %[temp0]                         \n\t"
+    "mflo   %[temp1]                         \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+      [i]"=&r"(i), [pop]"+r"(pop)
+    : [LoopEnd]"r"(LoopEnd)
+    : "memory", "hi", "lo"
+  );
+
+  return (double)((int64_t)temp0 << 32 | temp1);
+}
+
+// C version of this function:
+//   int i = 0;
+//   int64_t cost = 0;
+//   const uint32_t* pX = &X[4];
+//   const uint32_t* pY = &Y[4];
+//   const uint32_t* LoopEnd = &X[length];
+//   while (pX != LoopEnd) {
+//     const uint32_t xy0 = *pX + *pY;
+//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
+//     ++i;
+//     cost += i * xy0;
+//     cost += i * xy1;
+//     pX += 2;
+//     pY += 2;
+//   }
+//   return (double)cost;
+static double ExtraCostCombined(const uint32_t* const X,
+                                const uint32_t* const Y, int length) {
+  int i, temp0, temp1, temp2, temp3;
+  const uint32_t* pX = &X[4];
+  const uint32_t* pY = &Y[4];
+  const uint32_t* const LoopEnd = &X[length];
+
+  __asm__ volatile(
+    "mult   $zero,    $zero                  \n\t"
+    "xor    %[i],     %[i],       %[i]       \n\t"
+    "beq    %[pX],    %[LoopEnd], 2f         \n\t"
+  "1:                                        \n\t"
+    "lw     %[temp0], 0(%[pX])               \n\t"
+    "lw     %[temp1], 0(%[pY])               \n\t"
+    "lw     %[temp2], 4(%[pX])               \n\t"
+    "lw     %[temp3], 4(%[pY])               \n\t"
+    "addiu  %[i],     %[i],       1          \n\t"
+    "addu   %[temp0], %[temp0],   %[temp1]   \n\t"
+    "addu   %[temp2], %[temp2],   %[temp3]   \n\t"
+    "addiu  %[pX],    %[pX],      8          \n\t"
+    "addiu  %[pY],    %[pY],      8          \n\t"
+    "madd   %[i],     %[temp0]               \n\t"
+    "madd   %[i],     %[temp2]               \n\t"
+    "bne    %[pX],    %[LoopEnd], 1b         \n\t"
+  "2:                                        \n\t"
+    "mfhi   %[temp0]                         \n\t"
+    "mflo   %[temp1]                         \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
+    : [LoopEnd]"r"(LoopEnd)
+    : "memory", "hi", "lo"
+  );
+
+  return (double)((int64_t)temp0 << 32 | temp1);
+}
+
+#define HUFFMAN_COST_PASS                                 \
+  __asm__ volatile(                                       \
+    "sll   %[temp1],  %[temp0],    3           \n\t"      \
+    "addiu %[temp3],  %[streak],   -3          \n\t"      \
+    "addu  %[temp2],  %[pstreaks], %[temp1]    \n\t"      \
+    "blez  %[temp3],  1f                       \n\t"      \
+    "srl   %[temp1],  %[temp1],    1           \n\t"      \
+    "addu  %[temp3],  %[pcnts],    %[temp1]    \n\t"      \
+    "lw    %[temp0],  4(%[temp2])              \n\t"      \
+    "lw    %[temp1],  0(%[temp3])              \n\t"      \
+    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"      \
+    "addiu %[temp1],  %[temp1],    1           \n\t"      \
+    "sw    %[temp0],  4(%[temp2])              \n\t"      \
+    "sw    %[temp1],  0(%[temp3])              \n\t"      \
+    "b     2f                                  \n\t"      \
+  "1:                                          \n\t"      \
+    "lw    %[temp0],  0(%[temp2])              \n\t"      \
+    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"      \
+    "sw    %[temp0],  0(%[temp2])              \n\t"      \
+  "2:                                          \n\t"      \
+    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),           \
+      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)             \
+    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),         \
+      [streak]"r"(streak)                                 \
+    : "memory"                                            \
+  );
+
+// Returns the various RLE counts
+static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
+  int i;
+  int streak = 0;
+  VP8LStreaks stats;
+  int* const pstreaks = &stats.streaks[0][0];
+  int* const pcnts = &stats.counts[0];
+  int temp0, temp1, temp2, temp3;
+  memset(&stats, 0, sizeof(stats));
+  for (i = 0; i < length - 1; ++i) {
+    ++streak;
+    if (population[i] == population[i + 1]) {
+      continue;
+    }
+    temp0 = (population[i] != 0);
+    HUFFMAN_COST_PASS
+    streak = 0;
+  }
+  ++streak;
+  temp0 = (population[i] != 0);
+  HUFFMAN_COST_PASS
+
+  return stats;
+}
+
+static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
+                                            const uint32_t* Y, int length) {
+  int i;
+  int streak = 0;
+  VP8LStreaks stats;
+  int* const pstreaks = &stats.streaks[0][0];
+  int* const pcnts = &stats.counts[0];
+  int temp0, temp1, temp2, temp3;
+  memset(&stats, 0, sizeof(stats));
+  for (i = 0; i < length - 1; ++i) {
+    const uint32_t xy = X[i] + Y[i];
+    const uint32_t xy_next = X[i + 1] + Y[i + 1];
+    ++streak;
+    if (xy == xy_next) {
+      continue;
+    }
+    temp0 = (xy != 0);
+    HUFFMAN_COST_PASS
+    streak = 0;
+  }
+  {
+    const uint32_t xy = X[i] + Y[i];
+    ++streak;
+    temp0 = (xy != 0);
+    HUFFMAN_COST_PASS
+  }
+
+  return stats;
+}
+
+#define ASM_START                                       \
+  __asm__ volatile(                                     \
+    ".set   push                            \n\t"       \
+    ".set   at                              \n\t"       \
+    ".set   macro                           \n\t"       \
+  "1:                                       \n\t"
+
+// P2 = P0 + P1
+// A..D - offsets
+// E - temp variable to tell macro
+//     if pointer should be incremented
+// literal_ and successive histograms could be unaligned
+// so we must use ulw and usw
+#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)           \
+    "ulw    %[temp0], "#A"(%["#P0"])        \n\t"       \
+    "ulw    %[temp1], "#B"(%["#P0"])        \n\t"       \
+    "ulw    %[temp2], "#C"(%["#P0"])        \n\t"       \
+    "ulw    %[temp3], "#D"(%["#P0"])        \n\t"       \
+    "ulw    %[temp4], "#A"(%["#P1"])        \n\t"       \
+    "ulw    %[temp5], "#B"(%["#P1"])        \n\t"       \
+    "ulw    %[temp6], "#C"(%["#P1"])        \n\t"       \
+    "ulw    %[temp7], "#D"(%["#P1"])        \n\t"       \
+    "addu   %[temp4], %[temp4],   %[temp0]  \n\t"       \
+    "addu   %[temp5], %[temp5],   %[temp1]  \n\t"       \
+    "addu   %[temp6], %[temp6],   %[temp2]  \n\t"       \
+    "addu   %[temp7], %[temp7],   %[temp3]  \n\t"       \
+    "addiu  %["#P0"],  %["#P0"],  16        \n\t"       \
+  ".if "#E" == 1                            \n\t"       \
+    "addiu  %["#P1"],  %["#P1"],  16        \n\t"       \
+  ".endif                                   \n\t"       \
+    "usw    %[temp4], "#A"(%["#P2"])        \n\t"       \
+    "usw    %[temp5], "#B"(%["#P2"])        \n\t"       \
+    "usw    %[temp6], "#C"(%["#P2"])        \n\t"       \
+    "usw    %[temp7], "#D"(%["#P2"])        \n\t"       \
+    "addiu  %["#P2"], %["#P2"],   16        \n\t"       \
+    "bne    %["#P0"], %[LoopEnd], 1b        \n\t"       \
+    ".set   pop                             \n\t"       \
+
+#define ASM_END_COMMON_0                                \
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),         \
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),         \
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),         \
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),         \
+      [pa]"+r"(pa), [pout]"+r"(pout)
+
+#define ASM_END_COMMON_1                                \
+    : [LoopEnd]"r"(LoopEnd)                             \
+    : "memory", "at"                                    \
+  );
+
+#define ASM_END_0                                       \
+    ASM_END_COMMON_0                                    \
+      , [pb]"+r"(pb)                                    \
+    ASM_END_COMMON_1
+
+#define ASM_END_1                                       \
+    ASM_END_COMMON_0                                    \
+    ASM_END_COMMON_1
+
+#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE)  do {   \
+  const uint32_t* pa = (const uint32_t*)(A);            \
+  const uint32_t* pb = (const uint32_t*)(B);            \
+  uint32_t* pout = (uint32_t*)(OUT);                    \
+  const uint32_t* const LoopEnd = pa + (SIZE);          \
+  assert((SIZE) % 4 == 0);                              \
+  ASM_START                                             \
+  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)              \
+  ASM_END_0                                             \
+  if ((EXTRA_SIZE) > 0) {                               \
+    const int last = (EXTRA_SIZE);                      \
+    int i;                                              \
+    for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \
+  }                                                     \
+} while (0)
+
+#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE)  do {   \
+  const uint32_t* pa = (const uint32_t*)(A);            \
+  uint32_t* pout = (uint32_t*)(OUT);                    \
+  const uint32_t* const LoopEnd = pa + (SIZE);          \
+  assert((SIZE) % 4 == 0);                              \
+  ASM_START                                             \
+  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)            \
+  ASM_END_1                                             \
+  if ((EXTRA_SIZE) > 0) {                               \
+    const int last = (EXTRA_SIZE);                      \
+    int i;                                              \
+    for (i = 0; i < last; ++i) pout[i] += pa[i];        \
+  }                                                     \
+} while (0)
+
+static void HistogramAdd(const VP8LHistogram* const a,
+                         const VP8LHistogram* const b,
+                         VP8LHistogram* const out) {
+  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+  const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
+                             - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
+  assert(a->palette_code_bits_ == b->palette_code_bits_);
+
+  if (b != out) {
+    ADD_VECTOR(a->literal_, b->literal_, out->literal_,
+               NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
+    ADD_VECTOR(a->distance_, b->distance_, out->distance_,
+               NUM_DISTANCE_CODES, 0);
+    ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
+    ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
+    ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
+  } else {
+    ADD_VECTOR_EQ(a->literal_, out->literal_,
+                  NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
+    ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
+    ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
+    ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
+    ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
+  }
+}
+
+#undef ADD_VECTOR_EQ
+#undef ADD_VECTOR
+#undef ASM_END_1
+#undef ASM_END_0
+#undef ASM_END_COMMON_1
+#undef ASM_END_COMMON_0
+#undef ADD_TO_OUT
+#undef ASM_START
+
+#endif  // WEBP_USE_MIPS32
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMIPS32(void);
+
+void VP8LDspInitMIPS32(void) {
+#if defined(WEBP_USE_MIPS32)
+  VP8LFastSLog2Slow = FastSLog2Slow;
+  VP8LFastLog2Slow = FastLog2Slow;
+  VP8LExtraCost = ExtraCost;
+  VP8LExtraCostCombined = ExtraCostCombined;
+  VP8LHuffmanCostCount = HuffmanCostCount;
+  VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
+  VP8LHistogramAdd = HistogramAdd;
+#endif  // WEBP_USE_MIPS32
+}
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@ -0,0 +1,332 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "./lossless.h"
+#include "./neon.h"
+
+//------------------------------------------------------------------------------
+// Colorspace conversion functions
+
+#if !defined(WORK_AROUND_GCC)
+// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
+// gcc-4.8.x at least.
+static void ConvertBGRAToRGBA(const uint32_t* src,
+                              int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~15);
+  for (; src < end; src += 16) {
+    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
+    // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
+    const uint8x16_t tmp = pixel.val[0];
+    pixel.val[0] = pixel.val[2];
+    pixel.val[2] = tmp;
+    vst4q_u8(dst, pixel);
+    dst += 64;
+  }
+  VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
+}
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~15);
+  for (; src < end; src += 16) {
+    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
+    const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
+    vst3q_u8(dst, tmp);
+    dst += 48;
+  }
+  VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
+}
+
+static void ConvertBGRAToRGB(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~15);
+  for (; src < end; src += 16) {
+    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
+    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
+    vst3q_u8(dst, tmp);
+    dst += 48;
+  }
+  VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst);  // left-overs
+}
+
+#else  // WORK_AROUND_GCC
+
+// gcc-4.6.0 fallback
+
+static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
+
+static void ConvertBGRAToRGBA(const uint32_t* src,
+                              int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~1);
+  const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
+  for (; src < end; src += 2) {
+    const uint8x8_t pixels = vld1_u8((uint8_t*)src);
+    vst1_u8(dst, vtbl1_u8(pixels, shuffle));
+    dst += 8;
+  }
+  VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst);  // left-overs
+}
+
+static const uint8_t kBGRShuffle[3][8] = {
+  {  0,  1,  2,  4,  5,  6,  8,  9 },
+  { 10, 12, 13, 14, 16, 17, 18, 20 },
+  { 21, 22, 24, 25, 26, 28, 29, 30 }
+};
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~7);
+  const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
+  const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
+  const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
+  for (; src < end; src += 8) {
+    uint8x8x4_t pixels;
+    INIT_VECTOR4(pixels,
+                 vld1_u8((const uint8_t*)(src + 0)),
+                 vld1_u8((const uint8_t*)(src + 2)),
+                 vld1_u8((const uint8_t*)(src + 4)),
+                 vld1_u8((const uint8_t*)(src + 6)));
+    vst1_u8(dst +  0, vtbl4_u8(pixels, shuffle0));
+    vst1_u8(dst +  8, vtbl4_u8(pixels, shuffle1));
+    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
+    dst += 8 * 3;
+  }
+  VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst);  // left-overs
+}
+
+static const uint8_t kRGBShuffle[3][8] = {
+  {  2,  1,  0,  6,  5,  4, 10,  9 },
+  {  8, 14, 13, 12, 18, 17, 16, 22 },
+  { 21, 20, 26, 25, 24, 30, 29, 28 }
+};
+
+static void ConvertBGRAToRGB(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~7);
+  const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
+  const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
+  const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
+  for (; src < end; src += 8) {
+    uint8x8x4_t pixels;
+    INIT_VECTOR4(pixels,
+                 vld1_u8((const uint8_t*)(src + 0)),
+                 vld1_u8((const uint8_t*)(src + 2)),
+                 vld1_u8((const uint8_t*)(src + 4)),
+                 vld1_u8((const uint8_t*)(src + 6)));
+    vst1_u8(dst +  0, vtbl4_u8(pixels, shuffle0));
+    vst1_u8(dst +  8, vtbl4_u8(pixels, shuffle1));
+    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
+    dst += 8 * 3;
+  }
+  VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst);  // left-overs
+}
+
+#endif   // !WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+
+#ifdef USE_INTRINSICS
+
+static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
+                                     const uint32_t* const b) {
+  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
+  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
+  const uint8x8_t avg = vhadd_u8(a0, b0);
+  return vget_lane_u32(vreinterpret_u32_u8(avg), 0);
+}
+
+static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
+                                     const uint32_t* const b,
+                                     const uint32_t* const c) {
+  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
+  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
+  const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
+  const uint8x8_t avg1 = vhadd_u8(a0, c0);
+  const uint8x8_t avg2 = vhadd_u8(avg1, b0);
+  return vget_lane_u32(vreinterpret_u32_u8(avg2), 0);
+}
+
+static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
+                                     const uint32_t* const b,
+                                     const uint32_t* const c,
+                                     const uint32_t* const d) {
+  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
+  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
+  const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
+  const uint8x8_t d0 = vreinterpret_u8_u64(vcreate_u64(*d));
+  const uint8x8_t avg1 = vhadd_u8(a0, b0);
+  const uint8x8_t avg2 = vhadd_u8(c0, d0);
+  const uint8x8_t avg3 = vhadd_u8(avg1, avg2);
+  return vget_lane_u32(vreinterpret_u32_u8(avg3), 0);
+}
+
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+  return Average3(&left, top + 0, top + 1);
+}
+
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+  return Average2(&left, top - 1);
+}
+
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+  return Average2(&left, top + 0);
+}
+
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return Average2(top - 1, top + 0);
+}
+
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return Average2(top + 0, top + 1);
+}
+
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+  return Average4(&left, top - 1, top + 0, top + 1);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
+                                   const uint32_t* const c1,
+                                   const uint32_t* const c2) {
+  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
+  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
+  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
+  const uint8x8_t bc = vabd_u8(p1, p2);   // |b-c|
+  const uint8x8_t ac = vabd_u8(p0, p2);   // |a-c|
+  const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc));
+  const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac));
+  const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac));
+  const int32_t pa_minus_pb = vget_lane_s32(diff, 0);
+  return (pa_minus_pb <= 0) ? *c0 : *c1;
+}
+
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+  return Select(top + 0, &left, top - 1);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0,
+                                                   const uint32_t* const c1,
+                                                   const uint32_t* const c2) {
+  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
+  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
+  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
+  const uint16x8_t sum0 = vaddl_u8(p0, p1);                // add and widen
+  const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2));  // widen and subtract
+  const uint8x8_t out = vqmovn_u16(sum1);                  // narrow and clamp
+  return vget_lane_u32(vreinterpret_u32_u8(out), 0);
+}
+
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+  return ClampedAddSubtractFull(&left, top + 0, top - 1);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
+                                                   const uint32_t* const c1,
+                                                   const uint32_t* const c2) {
+  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
+  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
+  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
+  const uint8x8_t avg = vhadd_u8(p0, p1);                  // Average(c0,c1)
+  const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1);    // (a-b)>>1 saturated
+  const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1);    // (b-a)>>1 saturated
+  const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
+  return vget_lane_u32(vreinterpret_u32_u8(out), 0);
+}
+
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+  return ClampedAddSubtractHalf(&left, top + 0, top - 1);
+}
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// vtbl? are unavailable in iOS/arm64 builds.
+#if !defined(__aarch64__)
+
+// 255 = byte will be zero'd
+static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  const uint32_t* const end = argb_data + (num_pixels & ~3);
+  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+  for (; argb_data < end; argb_data += 4) {
+    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+    const uint8x16_t greens =
+        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+                    vtbl1_u8(vget_high_u8(argb), shuffle));
+    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  const uint32_t* const end = argb_data + (num_pixels & ~3);
+  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+  for (; argb_data < end; argb_data += 4) {
+    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+    const uint8x16_t greens =
+        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+                    vtbl1_u8(vget_high_u8(argb), shuffle));
+    vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
+  }
+  // fallthrough and finish off with plain-C
+  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+#endif   // !__aarch64__
+
+#endif   // USE_INTRINSICS
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+
+extern void VP8LDspInitNEON(void);
+
+void VP8LDspInitNEON(void) {
+#if defined(WEBP_USE_NEON)
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+
+#ifdef USE_INTRINSICS
+  VP8LPredictors[5] = Predictor5;
+  VP8LPredictors[6] = Predictor6;
+  VP8LPredictors[7] = Predictor7;
+  VP8LPredictors[8] = Predictor8;
+  VP8LPredictors[9] = Predictor9;
+  VP8LPredictors[10] = Predictor10;
+  VP8LPredictors[11] = Predictor11;
+  VP8LPredictors[12] = Predictor12;
+  VP8LPredictors[13] = Predictor13;
+
+#if !defined(__aarch64__)
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+#endif
+#endif
+
+#endif   // WEBP_USE_NEON
+}
+
+//------------------------------------------------------------------------------
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@ -0,0 +1,535 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of methods for lossless decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#include <assert.h>
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+#include "./lossless.h"
+
+//------------------------------------------------------------------------------
+// Predictor Transform
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
+  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
+  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+  const __m128i V1 = _mm_add_epi16(C0, C1);
+  const __m128i V2 = _mm_sub_epi16(V1, C2);
+  const __m128i b = _mm_packus_epi16(V2, V2);
+  const uint32_t output = _mm_cvtsi128_si32(b);
+  return output;
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
+  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
+  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+  const __m128i avg = _mm_add_epi16(C1, C0);
+  const __m128i A0 = _mm_srli_epi16(avg, 1);
+  const __m128i A1 = _mm_sub_epi16(A0, B0);
+  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
+  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
+  const __m128i A3 = _mm_srai_epi16(A2, 1);
+  const __m128i A4 = _mm_add_epi16(A0, A3);
+  const __m128i A5 = _mm_packus_epi16(A4, A4);
+  const uint32_t output = _mm_cvtsi128_si32(A5);
+  return output;
+}
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+  int pa_minus_pb;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i A0 = _mm_cvtsi32_si128(a);
+  const __m128i B0 = _mm_cvtsi32_si128(b);
+  const __m128i C0 = _mm_cvtsi32_si128(c);
+  const __m128i AC0 = _mm_subs_epu8(A0, C0);
+  const __m128i CA0 = _mm_subs_epu8(C0, A0);
+  const __m128i BC0 = _mm_subs_epu8(B0, C0);
+  const __m128i CB0 = _mm_subs_epu8(C0, B0);
+  const __m128i AC = _mm_or_si128(AC0, CA0);
+  const __m128i BC = _mm_or_si128(BC0, CB0);
+  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
+  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
+  const __m128i diff = _mm_sub_epi16(pb, pa);
+  {
+    int16_t out[8];
+    _mm_storeu_si128((__m128i*)out, diff);
+    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
+  }
+  return (pa_minus_pb <= 0) ? a : b;
+}
+
+static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i sum = _mm_add_epi16(A1, A0);
+  const __m128i avg = _mm_srli_epi16(sum, 1);
+  return avg;
+}
+
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+  const __m128i avg = Average2_128i(a0, a1);
+  const __m128i A2 = _mm_packus_epi16(avg, avg);
+  const uint32_t output = _mm_cvtsi128_si32(A2);
+  return output;
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i avg1 = Average2_128i(a0, a2);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i sum = _mm_add_epi16(avg1, A1);
+  const __m128i avg2 = _mm_srli_epi16(sum, 1);
+  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
+  const uint32_t output = _mm_cvtsi128_si32(A2);
+  return output;
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+                                     uint32_t a2, uint32_t a3) {
+  const __m128i avg1 = Average2_128i(a0, a1);
+  const __m128i avg2 = Average2_128i(a2, a3);
+  const __m128i sum = _mm_add_epi16(avg2, avg1);
+  const __m128i avg3 = _mm_srli_epi16(sum, 1);
+  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
+  const uint32_t output = _mm_cvtsi128_si32(A0);
+  return output;
+}
+
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average3(left, top[0], top[1]);
+  return pred;
+}
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(left, top[-1]);
+  return pred;
+}
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(left, top[0]);
+  return pred;
+}
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(top[-1], top[0]);
+  (void)left;
+  return pred;
+}
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(top[0], top[1]);
+  (void)left;
+  return pred;
+}
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+  return pred;
+}
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Select(top[0], left, top[-1]);
+  return pred;
+}
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+  return pred;
+}
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+  return pred;
+}
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  const __m128i mask = _mm_set1_epi32(0x0000ff00);
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
+    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
+    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
+    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
+    const __m128i out = _mm_sub_epi8(in, in_0g0g);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+}
+
+static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  const __m128i mask = _mm_set1_epi32(0x0000ff00);
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
+    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
+    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
+    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
+    const __m128i out = _mm_add_epi8(in, in_0g0g);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
+                                               __m128i color) {
+  // We simulate signed 8-bit multiplication as:
+  // * Left shift the two (8-bit) numbers by 8 bits,
+  // * Perform a 16-bit signed multiplication and retain the higher 16-bits.
+  const __m128i color_pred_shifted = _mm_slli_epi32(color_pred, 8);
+  const __m128i color_shifted = _mm_slli_epi32(color, 8);
+  // Note: This performs multiplication on 8 packed 16-bit numbers, 4 of which
+  // happen to be zeroes.
+  const __m128i signed_mult =
+      _mm_mulhi_epi16(color_pred_shifted, color_shifted);
+  return _mm_srli_epi32(signed_mult, 5);
+}
+
+static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
+                                       uint32_t* argb_data,
+                                       int num_pixels) {
+  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
+  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
+  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
+
+  int i;
+
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
+    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
+    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
+    const __m128i lower_8bit_mask  = _mm_set1_epi32(0x000000ff);
+    const __m128i ag = _mm_and_si128(in, alpha_green_mask);      // alpha, green
+    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
+    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
+    const __m128i b = in;
+
+    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
+    const __m128i r_new =
+        _mm_and_si128(_mm_sub_epi32(r, r_delta), lower_8bit_mask);
+    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
+
+    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
+    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r);
+    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
+    const __m128i b_new =
+        _mm_and_si128(_mm_sub_epi32(b, b_delta), lower_8bit_mask);
+
+    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+
+  // Fall-back to C-version for left-overs.
+  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
+static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
+                                              uint32_t* argb_data,
+                                              int num_pixels) {
+  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
+  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
+  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
+
+  int i;
+
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
+    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
+    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
+    const __m128i lower_8bit_mask  = _mm_set1_epi32(0x000000ff);
+    const __m128i ag = _mm_and_si128(in, alpha_green_mask);      // alpha, green
+    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
+    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
+    const __m128i b = in;
+
+    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
+    const __m128i r_new =
+        _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
+    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
+
+    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
+    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
+    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
+    const __m128i b_new =
+        _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);
+
+    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+
+  // Fall-back to C-version for left-overs.
+  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
+static void ConvertBGRAToRGBA(const uint32_t* src,
+                              int num_pixels, uint8_t* dst) {
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+  while (num_pixels >= 8) {
+    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
+    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
+    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
+    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
+    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
+    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
+    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
+    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
+    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
+    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
+    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
+    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
+    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
+    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
+    _mm_storeu_si128(out++, rgba0);
+    _mm_storeu_si128(out++, rgba4);
+    num_pixels -= 8;
+  }
+  // left-overs
+  VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+}
+
+static void ConvertBGRAToRGBA4444(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
+  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
+  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+  while (num_pixels >= 8) {
+    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
+    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
+    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
+    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
+    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);    // b0b2b4b6g0g2g4g6...
+    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);    // b1b3b5b7g1g3g5g7...
+    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);    // b0...b7 | g0...g7
+    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);    // r0...r7 | a0...a7
+    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);   // g0...g7 | a0...a7
+    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);   // r0...r7 | b0...b7
+    const __m128i ga1 = _mm_srli_epi16(ga0, 4);         // g0-|g1-|...|a6-|a7-
+    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);  // -r0|-r1|...|-b6|-a7
+    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
+    const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
+    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
+#ifdef WEBP_SWAP_16BIT_CSP
+    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
+#else
+    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
+#endif
+    _mm_storeu_si128(out++, rgba);
+    num_pixels -= 8;
+  }
+  // left-overs
+  VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+}
+
+static void ConvertBGRAToRGB565(const uint32_t* src,
+                                int num_pixels, uint8_t* dst) {
+  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
+  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
+  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+  while (num_pixels >= 8) {
+    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
+    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
+    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
+    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
+    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
+    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
+    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
+    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
+    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
+    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
+    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
+    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
+    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
+    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
+    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
+    const __m128i b0 = _mm_srli_si128(rb1, 8);              // -b0...-b7|0
+    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
+    const __m128i b1 = _mm_srli_epi16(b0, 3);
+    const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
+#ifdef WEBP_SWAP_16BIT_CSP
+    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);     // rggb0...rggb7
+#else
+    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);     // bgrb0...bgrb7
+#endif
+    _mm_storeu_si128(out++, rgba);
+    num_pixels -= 8;
+  }
+  // left-overs
+  VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+}
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
+  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
+  const __m128i* in = (const __m128i*)src;
+  const uint8_t* const end = dst + num_pixels * 3;
+  // the last storel_epi64 below writes 8 bytes starting at offset 18
+  while (dst + 26 <= end) {
+    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
+    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
+    const __m128i a0l = _mm_and_si128(bgra0, mask_l);   // bgr0|0|bgr0|0
+    const __m128i a4l = _mm_and_si128(bgra4, mask_l);   // bgr0|0|bgr0|0
+    const __m128i a0h = _mm_and_si128(bgra0, mask_h);   // 0|bgr0|0|bgr0
+    const __m128i a4h = _mm_and_si128(bgra4, mask_h);   // 0|bgr0|0|bgr0
+    const __m128i b0h = _mm_srli_epi64(a0h, 8);         // 000b|gr00|000b|gr00
+    const __m128i b4h = _mm_srli_epi64(a4h, 8);         // 000b|gr00|000b|gr00
+    const __m128i c0 = _mm_or_si128(a0l, b0h);          // rgbrgb00|rgbrgb00
+    const __m128i c4 = _mm_or_si128(a4l, b4h);          // rgbrgb00|rgbrgb00
+    const __m128i c2 = _mm_srli_si128(c0, 8);
+    const __m128i c6 = _mm_srli_si128(c4, 8);
+    _mm_storel_epi64((__m128i*)(dst +   0), c0);
+    _mm_storel_epi64((__m128i*)(dst +   6), c2);
+    _mm_storel_epi64((__m128i*)(dst +  12), c4);
+    _mm_storel_epi64((__m128i*)(dst +  18), c6);
+    dst += 24;
+    num_pixels -= 8;
+  }
+  // left-overs
+  VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
+}
+
+//------------------------------------------------------------------------------
+
+#define LINE_SIZE 16    // 8 or 16
+static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
+                      int size) {
+  int i;
+  assert(size % LINE_SIZE == 0);
+  for (i = 0; i < size; i += LINE_SIZE) {
+    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i +  0]);
+    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i +  8]);
+    const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
+#endif
+    const __m128i b0 = _mm_loadu_si128((__m128i*)&b[i +  0]);
+    const __m128i b1 = _mm_loadu_si128((__m128i*)&b[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i b2 = _mm_loadu_si128((__m128i*)&b[i +  8]);
+    const __m128i b3 = _mm_loadu_si128((__m128i*)&b[i + 12]);
+#endif
+    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
+    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+  }
+}
+
+static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
+  int i;
+  assert(size % LINE_SIZE == 0);
+  for (i = 0; i < size; i += LINE_SIZE) {
+    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i +  0]);
+    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i +  8]);
+    const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
+#endif
+    const __m128i b0 = _mm_loadu_si128((__m128i*)&out[i +  0]);
+    const __m128i b1 = _mm_loadu_si128((__m128i*)&out[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i b2 = _mm_loadu_si128((__m128i*)&out[i +  8]);
+    const __m128i b3 = _mm_loadu_si128((__m128i*)&out[i + 12]);
+#endif
+    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
+    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+  }
+}
+#undef LINE_SIZE
+
+// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
+// that's ok since the histogram values are less than 1<<28 (max picture size).
+static void HistogramAdd(const VP8LHistogram* const a,
+                         const VP8LHistogram* const b,
+                         VP8LHistogram* const out) {
+  int i;
+  const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+  assert(a->palette_code_bits_ == b->palette_code_bits_);
+  if (b != out) {
+    AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
+    AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
+  } else {
+    AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
+    AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
+  }
+  for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
+    out->literal_[i] = a->literal_[i] + b->literal_[i];
+  }
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    out->distance_[i] = a->distance_[i] + b->distance_[i];
+  }
+}
+
+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+
+extern void VP8LDspInitSSE2(void);
+
+void VP8LDspInitSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  VP8LPredictors[5] = Predictor5;
+  VP8LPredictors[6] = Predictor6;
+  VP8LPredictors[7] = Predictor7;
+  VP8LPredictors[8] = Predictor8;
+  VP8LPredictors[9] = Predictor9;
+  VP8LPredictors[10] = Predictor10;
+  VP8LPredictors[11] = Predictor11;
+  VP8LPredictors[12] = Predictor12;
+  VP8LPredictors[13] = Predictor13;
+
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+
+  VP8LTransformColor = TransformColor;
+  VP8LTransformColorInverse = TransformColorInverse;
+
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
+  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
+  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+
+  VP8LHistogramAdd = HistogramAdd;
+#endif   // WEBP_USE_SSE2
+}
+
+//------------------------------------------------------------------------------
--- a/src/dsp/neon.h
+++ b/src/dsp/neon.h
@ -0,0 +1,82 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  NEON common code.
+
+#ifndef WEBP_DSP_NEON_H_
+#define WEBP_DSP_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./dsp.h"
+
+// Right now, some intrinsics functions seem slower, so we disable them
+// everywhere except aarch64 where the inline assembly is incompatible.
+#if defined(__aarch64__)
+#define USE_INTRINSICS   // use intrinsics when possible
+#endif
+
+#define INIT_VECTOR2(v, a, b) do {  \
+  v.val[0] = a;                     \
+  v.val[1] = b;                     \
+} while (0)
+
+#define INIT_VECTOR3(v, a, b, c) do {  \
+  v.val[0] = a;                        \
+  v.val[1] = b;                        \
+  v.val[2] = c;                        \
+} while (0)
+
+#define INIT_VECTOR4(v, a, b, c, d) do {  \
+  v.val[0] = a;                           \
+  v.val[1] = b;                           \
+  v.val[2] = c;                           \
+  v.val[3] = d;                           \
+} while (0)
+
+// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
+// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
+// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
+#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#define WORK_AROUND_GCC
+#endif
+
+static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
+  uint64x2x2_t row01, row23;
+
+  row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
+  row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
+  row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
+  row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
+  // Transpose 64-bit values (there's no vswp equivalent)
+  {
+    const uint64x1_t row0h = vget_high_u64(row01.val[0]);
+    const uint64x1_t row2l = vget_low_u64(row23.val[0]);
+    const uint64x1_t row1h = vget_high_u64(row01.val[1]);
+    const uint64x1_t row3l = vget_low_u64(row23.val[1]);
+    row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
+    row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
+    row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
+    row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
+  }
+  {
+    const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
+                                        vreinterpretq_s32_u64(row01.val[1]));
+    const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
+                                        vreinterpretq_s32_u64(row23.val[1]));
+    int32x4x4_t out;
+    out.val[0] = out01.val[0];
+    out.val[1] = out01.val[1];
+    out.val[2] = out23.val[0];
+    out.val[3] = out23.val[1];
+    return out;
+  }
+}
+
+#endif  // WEBP_DSP_NEON_H_
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@ -0,0 +1,222 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV to RGB upsampling functions.
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include "./dsp.h"
+#include "./yuv.h"
+
+#include <assert.h>
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Fancy upsampling functions to convert YUV to RGB
+WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
+
+// Given samples laid out in a square as:
+//  [a b]
+//  [c d]
+// we interpolate u/v as:
+//  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
+//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u, v) ((u) | ((v) << 16))
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int x;                                                                       \
+  const int last_pixel_pair = (len - 1) >> 1;                                  \
+  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
+  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
+  assert(top_y != NULL);                                                       \
+  {                                                                            \
+    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
+    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
+  }                                                                            \
+  if (bottom_y != NULL) {                                                      \
+    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
+    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
+  }                                                                            \
+  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
+    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
+    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
+    /* precompute invariant values associated with first and second diagonals*/\
+    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
+    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
+    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
+    {                                                                          \
+      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
+      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
+      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
+           top_dst + (2 * x - 1) * XSTEP);                                     \
+      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
+           top_dst + (2 * x - 0) * XSTEP);                                     \
+    }                                                                          \
+    if (bottom_y != NULL) {                                                    \
+      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
+      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
+      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
+           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
+           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+    }                                                                          \
+    tl_uv = t_uv;                                                              \
+    l_uv = uv;                                                                 \
+  }                                                                            \
+  if (!(len & 1)) {                                                            \
+    {                                                                          \
+      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
+      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
+           top_dst + (len - 1) * XSTEP);                                       \
+    }                                                                          \
+    if (bottom_y != NULL) {                                                    \
+      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
+      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
+           bottom_dst + (len - 1) * XSTEP);                                    \
+    }                                                                          \
+  }                                                                            \
+}
+
+// All variants implemented.
+UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
+
+#undef LOAD_UV
+#undef UPSAMPLE_FUNC
+
+#endif  // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+
+#if !defined(FANCY_UPSAMPLING)
+#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC)                                      \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* bot_u, const uint8_t* bot_v,              \
+                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
+  const int half_len = len >> 1;                                               \
+  int x;                                                                       \
+  assert(top_dst != NULL);                                                     \
+  {                                                                            \
+    for (x = 0; x < half_len; ++x) {                                           \
+      FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
+      FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
+    }                                                                          \
+    if (len & 1) FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x);  \
+  }                                                                            \
+  if (bot_dst != NULL) {                                                       \
+    for (x = 0; x < half_len; ++x) {                                           \
+      FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x + 0);         \
+      FUNC(bot_y[2 * x + 1], bot_u[x], bot_v[x], bot_dst + 8 * x + 4);         \
+    }                                                                          \
+    if (len & 1) FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x);  \
+  }                                                                            \
+}
+
+DUAL_SAMPLE_FUNC(DualLineSamplerBGRA, VP8YuvToBgra)
+DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
+#undef DUAL_SAMPLE_FUNC
+
+#endif  // !FANCY_UPSAMPLING
+
+WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
+  WebPInitUpsamplers();
+  VP8YUVInit();
+#ifdef FANCY_UPSAMPLING
+  return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
+#else
+  return (alpha_is_last ? DualLineSamplerBGRA : DualLineSamplerARGB);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// YUV444 converter
+
+#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                      uint8_t* dst, int len) {                                 \
+  int i;                                                                       \
+  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
+}
+
+YUV444_FUNC(Yuv444ToRgb,      VP8YuvToRgb,  3)
+YUV444_FUNC(Yuv444ToBgr,      VP8YuvToBgr,  3)
+YUV444_FUNC(Yuv444ToRgba,     VP8YuvToRgba, 4)
+YUV444_FUNC(Yuv444ToBgra,     VP8YuvToBgra, 4)
+YUV444_FUNC(Yuv444ToArgb,     VP8YuvToArgb, 4)
+YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2)
+YUV444_FUNC(Yuv444ToRgb565,   VP8YuvToRgb565, 2)
+
+#undef YUV444_FUNC
+
+const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
+  Yuv444ToRgb,       // MODE_RGB
+  Yuv444ToRgba,      // MODE_RGBA
+  Yuv444ToBgr,       // MODE_BGR
+  Yuv444ToBgra,      // MODE_BGRA
+  Yuv444ToArgb,      // MODE_ARGB
+  Yuv444ToRgba4444,  // MODE_RGBA_4444
+  Yuv444ToRgb565,    // MODE_RGB_565
+  Yuv444ToRgba,      // MODE_rgbA
+  Yuv444ToBgra,      // MODE_bgrA
+  Yuv444ToArgb,      // MODE_Argb
+  Yuv444ToRgba4444   // MODE_rgbA_4444
+};
+
+//------------------------------------------------------------------------------
+// Main calls
+
+extern void WebPInitUpsamplersSSE2(void);
+extern void WebPInitUpsamplersNEON(void);
+
+void WebPInitUpsamplers(void) {
+#ifdef FANCY_UPSAMPLING
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitUpsamplersSSE2();
+    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      WebPInitUpsamplersNEON();
+    }
+#endif
+  }
+#endif  // FANCY_UPSAMPLING
+}
+
+//------------------------------------------------------------------------------
--- a/src/dsp/upsampling_neon.c
+++ b/src/dsp/upsampling_neon.c
@ -0,0 +1,267 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON version of YUV to RGB upsampling functions.
+//
+// Author: mans@mansr.com (Mans Rullgard)
+// Based on SSE code by: somnath@google.com (Somnath Banerjee)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <string.h>
+#include "./neon.h"
+#include "./yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+//-----------------------------------------------------------------------------
+// U/V upsampling
+
+// Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
+#define UPSAMPLE_16PIXELS(r1, r2, out) {                                \
+  uint8x8_t a = vld1_u8(r1);                                            \
+  uint8x8_t b = vld1_u8(r1 + 1);                                        \
+  uint8x8_t c = vld1_u8(r2);                                            \
+  uint8x8_t d = vld1_u8(r2 + 1);                                        \
+                                                                        \
+  uint16x8_t al = vshll_n_u8(a, 1);                                     \
+  uint16x8_t bl = vshll_n_u8(b, 1);                                     \
+  uint16x8_t cl = vshll_n_u8(c, 1);                                     \
+  uint16x8_t dl = vshll_n_u8(d, 1);                                     \
+                                                                        \
+  uint8x8_t diag1, diag2;                                               \
+  uint16x8_t sl;                                                        \
+                                                                        \
+  /* a + b + c + d */                                                   \
+  sl = vaddl_u8(a,  b);                                                 \
+  sl = vaddw_u8(sl, c);                                                 \
+  sl = vaddw_u8(sl, d);                                                 \
+                                                                        \
+  al = vaddq_u16(sl, al); /* 3a +  b +  c +  d */                       \
+  bl = vaddq_u16(sl, bl); /*  a + 3b +  c +  d */                       \
+                                                                        \
+  al = vaddq_u16(al, dl); /* 3a +  b +  c + 3d */                       \
+  bl = vaddq_u16(bl, cl); /*  a + 3b + 3c +  d */                       \
+                                                                        \
+  diag2 = vshrn_n_u16(al, 3);                                           \
+  diag1 = vshrn_n_u16(bl, 3);                                           \
+                                                                        \
+  a = vrhadd_u8(a, diag1);                                              \
+  b = vrhadd_u8(b, diag2);                                              \
+  c = vrhadd_u8(c, diag2);                                              \
+  d = vrhadd_u8(d, diag1);                                              \
+                                                                        \
+  {                                                                     \
+    uint8x8x2_t a_b, c_d;                                               \
+    INIT_VECTOR2(a_b, a, b);                                            \
+    INIT_VECTOR2(c_d, c, d);                                            \
+    vst2_u8(out,      a_b);                                             \
+    vst2_u8(out + 32, c_d);                                             \
+  }                                                                     \
+}
+
+// Turn the macro into a function for reducing code-size when non-critical
+static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
+                             uint8_t *out) {
+  UPSAMPLE_16PIXELS(r1, r2, out);
+}
+
+#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) {                  \
+  uint8_t r1[9], r2[9];                                                 \
+  memcpy(r1, (tb), (num_pixels));                                       \
+  memcpy(r2, (bb), (num_pixels));                                       \
+  /* replicate last byte */                                             \
+  memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels));    \
+  memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels));    \
+  Upsample16Pixels(r1, r2, out);                                        \
+}
+
+//-----------------------------------------------------------------------------
+// YUV->RGB conversion
+
+static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
+
+#define v255 vdup_n_u8(255)
+
+#define STORE_Rgb(out, r, g, b) do {                                    \
+  uint8x8x3_t r_g_b;                                                    \
+  INIT_VECTOR3(r_g_b, r, g, b);                                         \
+  vst3_u8(out, r_g_b);                                                  \
+} while (0)
+
+#define STORE_Bgr(out, r, g, b) do {                                    \
+  uint8x8x3_t b_g_r;                                                    \
+  INIT_VECTOR3(b_g_r, b, g, r);                                         \
+  vst3_u8(out, b_g_r);                                                  \
+} while (0)
+
+#define STORE_Rgba(out, r, g, b) do {                                   \
+  uint8x8x4_t r_g_b_v255;                                               \
+  INIT_VECTOR4(r_g_b_v255, r, g, b, v255);                              \
+  vst4_u8(out, r_g_b_v255);                                             \
+} while (0)
+
+#define STORE_Bgra(out, r, g, b) do {                                   \
+  uint8x8x4_t b_g_r_v255;                                               \
+  INIT_VECTOR4(b_g_r_v255, b, g, r, v255);                              \
+  vst4_u8(out, b_g_r_v255);                                             \
+} while (0)
+
+#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
+  int i;                                                                \
+  for (i = 0; i < N; i += 8) {                                          \
+    const int off = ((cur_x) + i) * XSTEP;                              \
+    uint8x8_t y  = vld1_u8((src_y) + (cur_x)  + i);                     \
+    uint8x8_t u  = vld1_u8((src_uv) + i);                               \
+    uint8x8_t v  = vld1_u8((src_uv) + i + 16);                          \
+    const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));       \
+    const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));      \
+    const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));      \
+    int32x4_t yl = vmull_lane_s16(vget_low_s16(yy),  cf16, 0);          \
+    int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0);          \
+    const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv),  cf16, 1);\
+    const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
+    int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu),  cf16, 2);      \
+    int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2);      \
+    const int32x4_t bl = vmovl_s16(vget_low_s16(uu));                   \
+    const int32x4_t bh = vmovl_s16(vget_high_s16(uu));                  \
+    gl = vmlsl_lane_s16(gl, vget_low_s16(vv),  cf16, 3);                \
+    gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3);                \
+    yl = vmlaq_lane_s32(yl, bl, cf32, 0);                               \
+    yh = vmlaq_lane_s32(yh, bh, cf32, 0);                               \
+    /* vrshrn_n_s32() already incorporates the rounding constant */     \
+    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2),            \
+                                 vrshrn_n_s32(rh, YUV_FIX2)));          \
+    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2),            \
+                                 vrshrn_n_s32(gh, YUV_FIX2)));          \
+    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2),            \
+                                 vrshrn_n_s32(yh, YUV_FIX2)));          \
+    STORE_ ## FMT(out + off, y, u, v);                                  \
+  }                                                                     \
+}
+
+#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) {           \
+  int i;                                                                \
+  for (i = 0; i < N; i++) {                                             \
+    const int off = ((cur_x) + i) * XSTEP;                              \
+    const int y = src_y[(cur_x) + i];                                   \
+    const int u = (src_uv)[i];                                          \
+    const int v = (src_uv)[i + 16];                                     \
+    FUNC(y, u, v, rgb + off);                                           \
+  }                                                                     \
+}
+
+#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
+                      top_dst, bottom_dst, cur_x, len) {                \
+  CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                  \
+  if (bottom_y != NULL) {                                               \
+    CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x)   \
+  }                                                                     \
+}
+
+#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv,                 \
+                      top_dst, bottom_dst, cur_x, len) {                \
+  CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x);                \
+  if (bottom_y != NULL) {                                               \
+    CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
+  }                                                                     \
+}
+
+#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
+static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
+                      const uint8_t *top_u, const uint8_t *top_v,       \
+                      const uint8_t *cur_u, const uint8_t *cur_v,       \
+                      uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
+  int block;                                                            \
+  /* 16 byte aligned array to cache reconstructed u and v */            \
+  uint8_t uv_buf[2 * 32 + 15];                                          \
+  uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
+  const int uv_len = (len + 1) >> 1;                                    \
+  /* 9 pixels must be read-able for each block */                       \
+  const int num_blocks = (uv_len - 1) >> 3;                             \
+  const int leftover = uv_len - num_blocks * 8;                         \
+  const int last_pos = 1 + 16 * num_blocks;                             \
+                                                                        \
+  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
+  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
+                                                                        \
+  const int16x4_t cf16 = vld1_s16(kCoeffs);                             \
+  const int32x2_t cf32 = vdup_n_s32(kUToB);                             \
+  const uint8x8_t u16  = vdup_n_u8(16);                                 \
+  const uint8x8_t u128 = vdup_n_u8(128);                                \
+                                                                        \
+  /* Treat the first pixel in regular way */                            \
+  assert(top_y != NULL);                                                \
+  {                                                                     \
+    const int u0 = (top_u[0] + u_diag) >> 1;                            \
+    const int v0 = (top_v[0] + v_diag) >> 1;                            \
+    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
+  }                                                                     \
+  if (bottom_y != NULL) {                                               \
+    const int u0 = (cur_u[0] + u_diag) >> 1;                            \
+    const int v0 = (cur_v[0] + v_diag) >> 1;                            \
+    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
+  }                                                                     \
+                                                                        \
+  for (block = 0; block < num_blocks; ++block) {                        \
+    UPSAMPLE_16PIXELS(top_u, cur_u, r_uv);                              \
+    UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16);                         \
+    CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv,                    \
+                  top_dst, bottom_dst, 16 * block + 1, 16);             \
+    top_u += 8;                                                         \
+    cur_u += 8;                                                         \
+    top_v += 8;                                                         \
+    cur_v += 8;                                                         \
+  }                                                                     \
+                                                                        \
+  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
+  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
+  CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,          \
+                top_dst, bottom_dst, last_pos, len - last_pos);         \
+}
+
+// NEON variants of the fancy upsampler.
+NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair,  Rgb,  3)
+NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair,  Bgr,  3)
+NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
+NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
+
+#endif  // FANCY_UPSAMPLING
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitUpsamplersNEON(void);
+
+#ifdef FANCY_UPSAMPLING
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+void WebPInitUpsamplersNEON(void) {
+#if defined(WEBP_USE_NEON)
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+#endif   // WEBP_USE_NEON
+}
+
+#else
+
+// this empty function is to avoid an empty .o
+void WebPInitUpsamplersNEON(void) {}
+
+#endif  // FANCY_UPSAMPLING
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@ -0,0 +1,214 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of YUV to RGB upsampling functions.
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <string.h>
+#include "./yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
+// u = (9*a + 3*b + 3*c + d + 8) / 16
+//   = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
+//   = (a + m + 1) / 2
+// where m = (a + 3*b + 3*c + d) / 8
+//         = ((a + b + c + d) / 2 + b + c) / 4
+//
+// Let's say  k = (a + b + c + d) / 4.
+// We can compute k as
+// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
+// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
+//
+// Then m can be written as
+// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
+
+// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
+#define GET_M(ij, in, out) do {                                                \
+  const __m128i tmp0 = _mm_avg_epu8(k, (in));     /* (k + in + 1) / 2 */       \
+  const __m128i tmp1 = _mm_and_si128((ij), st);   /* (ij) & (s^t) */           \
+  const __m128i tmp2 = _mm_xor_si128(k, (in));    /* (k^in) */                 \
+  const __m128i tmp3 = _mm_or_si128(tmp1, tmp2);  /* ((ij) & (s^t)) | (k^in) */\
+  const __m128i tmp4 = _mm_and_si128(tmp3, one);  /* & 1 -> lsb_correction */  \
+  (out) = _mm_sub_epi8(tmp0, tmp4);    /* (k + in + 1) / 2 - lsb_correction */ \
+} while (0)
+
+// pack and store two alternating pixel rows
+#define PACK_AND_STORE(a, b, da, db, out) do {                                 \
+  const __m128i t_a = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */ \
+  const __m128i t_b = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */ \
+  const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b);                             \
+  const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b);                             \
+  _mm_store_si128(((__m128i*)(out)) + 0, t_1);                                 \
+  _mm_store_si128(((__m128i*)(out)) + 1, t_2);                                 \
+} while (0)
+
+// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
+#define UPSAMPLE_32PIXELS(r1, r2, out) {                                       \
+  const __m128i one = _mm_set1_epi8(1);                                        \
+  const __m128i a = _mm_loadu_si128((__m128i*)&(r1)[0]);                       \
+  const __m128i b = _mm_loadu_si128((__m128i*)&(r1)[1]);                       \
+  const __m128i c = _mm_loadu_si128((__m128i*)&(r2)[0]);                       \
+  const __m128i d = _mm_loadu_si128((__m128i*)&(r2)[1]);                       \
+                                                                               \
+  const __m128i s = _mm_avg_epu8(a, d);        /* s = (a + d + 1) / 2 */       \
+  const __m128i t = _mm_avg_epu8(b, c);        /* t = (b + c + 1) / 2 */       \
+  const __m128i st = _mm_xor_si128(s, t);      /* st = s^t */                  \
+                                                                               \
+  const __m128i ad = _mm_xor_si128(a, d);      /* ad = a^d */                  \
+  const __m128i bc = _mm_xor_si128(b, c);      /* bc = b^c */                  \
+                                                                               \
+  const __m128i t1 = _mm_or_si128(ad, bc);     /* (a^d) | (b^c) */             \
+  const __m128i t2 = _mm_or_si128(t1, st);     /* (a^d) | (b^c) | (s^t) */     \
+  const __m128i t3 = _mm_and_si128(t2, one);   /* (a^d) | (b^c) | (s^t) & 1 */ \
+  const __m128i t4 = _mm_avg_epu8(s, t);                                       \
+  const __m128i k = _mm_sub_epi8(t4, t3);      /* k = (a + b + c + d) / 4 */   \
+  __m128i diag1, diag2;                                                        \
+                                                                               \
+  GET_M(bc, t, diag1);                  /* diag1 = (a + 3b + 3c + d) / 8 */    \
+  GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
+                                                                               \
+  /* pack the alternate pixels */                                              \
+  PACK_AND_STORE(a, b, diag1, diag2, out +      0);  /* store top */           \
+  PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32);  /* store bottom */        \
+}
+
+// Turn the macro into a function for reducing code-size when non-critical
+static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
+                             uint8_t* const out) {
+  UPSAMPLE_32PIXELS(r1, r2, out);
+}
+
+#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) {                         \
+  uint8_t r1[17], r2[17];                                                      \
+  memcpy(r1, (tb), (num_pixels));                                              \
+  memcpy(r2, (bb), (num_pixels));                                              \
+  /* replicate last byte */                                                    \
+  memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels));          \
+  memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels));          \
+  /* using the shared function instead of the macro saves ~3k code size */     \
+  Upsample32Pixels(r1, r2, out);                                               \
+}
+
+#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
+                    top_dst, bottom_dst, cur_x, num_pixels) {                  \
+  int n;                                                                       \
+  for (n = 0; n < (num_pixels); ++n) {                                         \
+    FUNC(top_y[(cur_x) + n], r_u[n], r_v[n],                                   \
+         top_dst + ((cur_x) + n) * XSTEP);                                     \
+  }                                                                            \
+  if (bottom_y != NULL) {                                                      \
+    for (n = 0; n < (num_pixels); ++n) {                                       \
+      FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n],                    \
+           bottom_dst + ((cur_x) + n) * XSTEP);                                \
+    }                                                                          \
+  }                                                                            \
+}
+
+#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
+                       top_dst, bottom_dst, cur_x) do {                        \
+  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
+  if (bottom_y != NULL) {                                                      \
+    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
+             bottom_dst + (cur_x) * XSTEP);                                    \
+  }                                                                            \
+} while (0)
+
+#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int uv_pos, pos;                                                             \
+  /* 16byte-aligned array to cache reconstructed u and v */                    \
+  uint8_t uv_buf[4 * 32 + 15];                                                 \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
+  uint8_t* const r_v = r_u + 32;                                               \
+                                                                               \
+  assert(top_y != NULL);                                                       \
+  {   /* Treat the first pixel in regular way */                               \
+    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                       \
+    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                       \
+    const int u0_t = (top_u[0] + u_diag) >> 1;                                 \
+    const int v0_t = (top_v[0] + v_diag) >> 1;                                 \
+    FUNC(top_y[0], u0_t, v0_t, top_dst);                                       \
+    if (bottom_y != NULL) {                                                    \
+      const int u0_b = (cur_u[0] + u_diag) >> 1;                               \
+      const int v0_b = (cur_v[0] + v_diag) >> 1;                               \
+      FUNC(bottom_y[0], u0_b, v0_b, bottom_dst);                               \
+    }                                                                          \
+  }                                                                            \
+  /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */  \
+  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {    \
+    UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
+    UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
+    CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos);    \
+  }                                                                            \
+  if (len > 1) {                                                               \
+    const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
+    assert(left_over > 0);                                                     \
+    UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
+    UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
+    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst,             \
+                pos, len - pos);                                               \
+  }                                                                            \
+}
+
+// SSE2 variants of the fancy upsampler.
+SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
+
+#undef GET_M
+#undef PACK_AND_STORE
+#undef UPSAMPLE_32PIXELS
+#undef UPSAMPLE_LAST_BLOCK
+#undef CONVERT2RGB
+#undef CONVERT2RGB_32
+#undef SSE2_UPSAMPLE_FUNC
+
+#endif  // FANCY_UPSAMPLING
+
+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitUpsamplersSSE2(void);
+
+#ifdef FANCY_UPSAMPLING
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+void WebPInitUpsamplersSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  VP8YUVInitSSE2();
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+#endif   // WEBP_USE_SSE2
+}
+
+#else
+
+// this empty function is to avoid an empty .o
+void WebPInitUpsamplersSSE2(void) {}
+
+#endif  // FANCY_UPSAMPLING
--- a/Show More
+++ b/Show More