update ChangeLog

Change-Id: I60c273c650a305fe36564ccc5fb1c8d7ea18118f
update NEWS
2025-07-15 21:39:59 +02:00 · 2015-03-04 11:30:23 -08:00 · 2015-03-03 19:19:50 -08:00 · 2015-03-03 19:05:40 -08:00 · 2015-03-03 17:53:49 -08:00 · 2015-03-03 17:53:49 -08:00
174 changed files with 29603 additions and 9433 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -2,3 +2,4 @@
 .gitignore export-ignore
 .mailmap export-ignore
 *.pdf -text -diff
 *.ppm -text -diff
--- a/.gitignore
+++ b/.gitignore
@ -1,8 +1,10 @@
 *.l[ao]
-*.o
+*.[ao]
 *.pc
 .deps
 .libs
 /aclocal.m4
 /ar-lib
 /autom4te.cache
 /compile
 /config.*
@ -13,10 +15,15 @@
 /libtool
 /ltmain.sh
 /missing
 /mkinstalldirs
 /stamp-h1
 Makefile
 Makefile.in
-examples/[cd]webp
+examples/[cdv]webp
 examples/gif2webp
 examples/webpmux
 src/webp/config.h*
 src/webp/stamp-h1
 /output
 /doc/output
 *.idb
--- a/.mailmap
+++ b/.mailmap
@ -1,6 +1,8 @@
 <johann.koenig@duck.com> <johannkoenig@google.com>
 Mikołaj Zalewski <mikolajz@google.com>
 Pascal Massimino <pascal.massimino@gmail.com>
 <pascal.massimino@gmail.com> <skal@google.com>
 Vikas Arora <vikasa@google.com>
 <vikasa@google.com> <vikasa@gmail.com>
 <vikasa@google.com> <vikaas.arora@gmail.com>
 <slobodan.prijic@imgtec.com> <Slobodan.Prijic@imgtec.com>
--- a/9
+++ b/9
@ -1,16 +1,25 @@
 Contributors:
 - Charles Munger (clm at google dot com)
 - Christian Duvivier (cduvivier at google dot com)
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Johann (johann dot koenig at duck dot com)
 - Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
 - Jyrki Alakuijala (jyrki at google dot com)
 - levytamar82 (tamar dot levy at intel dot com)
 - Lou Quillio (louquillio at google dot com)
 - Mans Rullgard (mans at mansr dot com)
 - Martin Olsson (mnemo at minimum dot se)
 - Mikołaj Zalewski (mikolajz at google dot com)
 - Noel Chromium (noel at chromium dot org)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
 - Pierre Joye (pierre dot php at gmail dot com)
 - Scott LaVarnway (slavarnway at google dot com)
 - Scott Talbot (s at chikachow dot org)
 - Slobodan Prijic (slobodan dot prijic at imgtec dot com)
 - Somnath Banerjee (somnath dot banerjee at gmail dot com)
 - Timothy Gu (timothygu99 at gmail dot com)
 - Urvang Joshi (urvang at google dot com)
 - Vikas Arora (vikasa at google dot com)
--- a/Android.mk
+++ b/Android.mk
@ -1,27 +1,67 @@
 LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
+WEBP_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD -DWEBP_USE_THREAD
-LOCAL_SRC_FILES := \
+
 ifeq ($(APP_OPTIM),release)
  WEBP_CFLAGS += -finline-functions -ffast-math \
                 -ffunction-sections -fdata-sections
  ifeq ($(findstring clang,$(NDK_TOOLCHAIN_VERSION)),)
    WEBP_CFLAGS += -frename-registers -s
  endif
 endif
 ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
  # instructions to be generated for armv7a code. Instead target the neon code
  # specifically.
  NEON := c.neon
 else
  NEON := c
 endif
 dec_srcs := \
    src/dec/alpha.c \
    src/dec/buffer.c \
    src/dec/frame.c \
    src/dec/idec.c \
    src/dec/io.c \
    src/dec/layer.c \
    src/dec/quant.c \
    src/dec/tree.c \
    src/dec/vp8.c \
    src/dec/vp8l.c \
    src/dec/webp.c \
 demux_srcs := \
    src/demux/demux.c \
 dsp_dec_srcs := \
    src/dsp/alpha_processing.c \
    src/dsp/alpha_processing_sse2.c \
    src/dsp/cpu.c \
    src/dsp/dec.c \
    src/dsp/dec_clip_tables.c \
    src/dsp/dec_mips32.c \
    src/dsp/dec_neon.$(NEON) \
    src/dsp/dec_sse2.c \
    src/dsp/enc.c \
    src/dsp/enc_sse2.c \
    src/dsp/lossless.c \
    src/dsp/lossless_mips32.c \
    src/dsp/lossless_neon.$(NEON) \
    src/dsp/lossless_sse2.c \
    src/dsp/upsampling.c \
    src/dsp/upsampling_neon.$(NEON) \
    src/dsp/upsampling_sse2.c \
    src/dsp/yuv.c \
    src/dsp/yuv_mips32.c \
    src/dsp/yuv_sse2.c \
 dsp_enc_srcs := \
    src/dsp/enc.c \
    src/dsp/enc_avx2.c \
    src/dsp/enc_mips32.c \
    src/dsp/enc_neon.$(NEON) \
    src/dsp/enc_sse2.c \
 enc_srcs := \
    src/enc/alpha.c \
    src/enc/analysis.c \
    src/enc/backward_references.c \
@ -31,45 +71,145 @@ LOCAL_SRC_FILES := \
    src/enc/frame.c \
    src/enc/histogram.c \
    src/enc/iterator.c \
    src/enc/layer.c \
    src/enc/picture.c \
    src/enc/picture_csp.c \
    src/enc/picture_psnr.c \
    src/enc/picture_rescale.c \
    src/enc/picture_tools.c \
    src/enc/quant.c \
    src/enc/syntax.c \
    src/enc/token.c \
    src/enc/tree.c \
    src/enc/vp8l.c \
    src/enc/webpenc.c \
 mux_srcs := \
    src/mux/muxedit.c \
    src/mux/muxinternal.c \
    src/mux/muxread.c \
 utils_dec_srcs := \
    src/utils/bit_reader.c \
    src/utils/bit_writer.c \
    src/utils/color_cache.c \
    src/utils/filters.c \
    src/utils/huffman.c \
    src/utils/huffman_encode.c \
    src/utils/quant_levels.c \
    src/utils/quant_levels_dec.c \
    src/utils/random.c \
    src/utils/rescaler.c \
    src/utils/thread.c \
    src/utils/utils.c \
-LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD \
+utils_enc_srcs := \
-                -DWEBP_USE_THREAD \
+    src/utils/bit_writer.c \
-                -finline-functions -frename-registers -ffast-math \
+    src/utils/huffman_encode.c \
-                -s -fomit-frame-pointer -Isrc/webp
+    src/utils/quant_levels.c \
 ################################################################################
 # libwebpdecoder
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
    $(dec_srcs) \
    $(dsp_dec_srcs) \
    $(utils_dec_srcs) \
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
-ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+# prefer arm over thumb mode for performance gains
-  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
+LOCAL_ARM_MODE := arm
-  # instructions to be generated for armv7a code. Instead target the neon code
+
  # specifically.
  LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
  LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
  LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
 endif
 LOCAL_STATIC_LIBRARIES := cpufeatures
-LOCAL_MODULE:= webp
+LOCAL_MODULE := webpdecoder_static
 include $(BUILD_STATIC_LIBRARY)
 ifeq ($(ENABLE_SHARED),1)
 include $(CLEAR_VARS)
 LOCAL_WHOLE_STATIC_LIBRARIES := webpdecoder_static
 LOCAL_MODULE := webpdecoder
 include $(BUILD_SHARED_LIBRARY)
 endif  # ENABLE_SHARED=1
 ################################################################################
 # libwebp
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
    $(dsp_enc_srcs) \
    $(enc_srcs) \
    $(utils_enc_srcs) \
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
 # prefer arm over thumb mode for performance gains
 LOCAL_ARM_MODE := arm
 LOCAL_WHOLE_STATIC_LIBRARIES := webpdecoder_static
 LOCAL_MODULE := webp
 ifeq ($(ENABLE_SHARED),1)
  include $(BUILD_SHARED_LIBRARY)
 else
  include $(BUILD_STATIC_LIBRARY)
 endif
 ################################################################################
 # libwebpdemux
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := $(demux_srcs)
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
 # prefer arm over thumb mode for performance gains
 LOCAL_ARM_MODE := arm
 LOCAL_MODULE := webpdemux
 ifeq ($(ENABLE_SHARED),1)
  LOCAL_SHARED_LIBRARIES := webp
  include $(BUILD_SHARED_LIBRARY)
 else
  LOCAL_STATIC_LIBRARIES := webp
  include $(BUILD_STATIC_LIBRARY)
 endif
 ################################################################################
 # libwebpmux
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := $(mux_srcs)
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
 # prefer arm over thumb mode for performance gains
 LOCAL_ARM_MODE := arm
 LOCAL_MODULE := webpmux
 ifeq ($(ENABLE_SHARED),1)
  LOCAL_SHARED_LIBRARIES := webp
  include $(BUILD_SHARED_LIBRARY)
 else
  LOCAL_STATIC_LIBRARIES := webp
  include $(BUILD_STATIC_LIBRARY)
 endif
 ################################################################################
 include $(LOCAL_PATH)/examples/Android.mk
 $(call import-module,android/cpufeatures)
--- a/946
+++ b/946
@ -1,7 +1,939 @@
 569fe57 update NEWS
 bd852f5 bump version to 0.4.3
 2d58b64 WebPPictureRescale: add a note about 0 width/height
 a0d8ca5 examples/Android.mk: add webpmux_example target
 34b1d29 Android.mk: add webpmux target
 7561988 Android.mk: add webpdemux target
 a987576 Android.mk: add webpdecoder{,_static} targets
 a6d4859 Android.mk: split source lists per-directory
 77544d5 fix iOS arm64 build with Xcode 6.3
 6dea157 doc/webp-container-spec: note MSB order for chunk diagrams
 f7cd57b doc/webp-container-spec: cosmetics
 1d6b250 vwebp: clear canvas at the beginning of each loop
 f97b3f8 webp-container-spec: clarify background clear on loop
 4ba83c1 vwebp: remove unnecessary static Help() prototype
 d34e8e3 vwebp/animation: display last frame on end-of-loop
 bbbc524 dec/vp8: clear 'dither_' on skipped blocks
 0339fa2 lossless_neon: enable subtract green for aarch64
 5a0c220 Regression fix for lossless decoding
 6e3a31d wicdec: (msvs) quiet some /analyze warnings
 b49a578 dwebp/WritePNG: mark png variables volatile
 0a4391a dwebp: include setjmp.h w/WEBP_HAVE_PNG
 90f1ec5 dwebp: correct sign in format strings
 b61ce86 VP8LEncodeStream: add an assert
 df1081b dsp/cpu: (msvs) add include for __cpuidex
 39aa055 dsp/cpu: (msvs) avoid immintrin.h on _M_ARM
 f814f42 dsp/cpu: add include for _xgetbv() w/MSVS
 8508ab9 cpu: fix AVX2 detection for gcc/clang targets
 5769623 fix handling of zero-sized partition #0 corner case
 b2e71a9 make the 'last_cpuinfo_used' variable names unique
 1273e84 add -Wformat-nonliteral and -Wformat-security
 3ae78eb multi-thread fix: lock each entry points with a static var
 5c1eeda webp-container-spec: remove references to fragments
 c5ceea4 enc_neon: fix building with non-Xcode clang (iOS)
 d0859d6 iosbuild: add x64_64 simulator support
 046732c WebPEncode: Support encoding same pic twice (even if modified)
 4426f50 webp/types.h: use inline for clang++/-std=c++11
 e297fc7 gif2webp: Use the default hint instead of WEBP_HINT_GRAPH.
 855fe43 Makefile.vc: add a 'legacy' RTLIBCFG option
 b7eb6d5 gif2webp: Support GIF_DISPOSE_RESTORE_PREVIOUS
 5691bdd gif2webp: Handle frames with odd offsets + disposal to background.
 8301da1 stopwatch.h: fix includes
 6a2209a update ChangeLog (tag: v0.4.2, origin/0.4.2, 0.4.2)
 36cad6a bit_reader.h: cosmetics: fix a typo
 e2ecae6 enc_mips32: workaround gcc-4.9 bug
 243e68d update ChangeLog (tag: v0.4.2-rc2)
 eec5f5f enc/vp8enci.h: update version number
 0c1b98d update NEWS
 69b0fc9 update AUTHORS
 857578a bump version to 0.4.2
 9129deb restore encode API compatibility
 f17b95e AssignSegments: quiet -Warray-bounds warning
 9c56c8a enc_neon: initialize vectors w/vdup_n_u32
 a008902 iosbuild: cleanup
 cc6de53 iosbuild: output autoconf req. on failure
 740d765 iosbuild: make iOS 6 the minimum requirement
 403023f iobuild.sh: only install .h files in Headers
 b65727b Premultiply with alpha during U/V downsampling
 8de0deb gif2webp: Background color correction
 f8b7d94 Amend the lossless spec according to issue #205, #206 and #224
 9102a7b Add a WebPExtractAlpha function to dsp
 e407b5d webpmux: simplify InitializeConfig()
 3e70e64 webpmux: fix indent
 be38f1a webpmux: fix exit status on numeric value parse error
 94dadcb webpmux: fix loop_count range check
 40b3a61 examples: warn on invalid numeric parameters
 b7d209a gif2webp: Handle frames with missing  graphic control extension
 bf0eb74 configure: simplify libpng-config invocation
 3740f7d Rectify bug in lossless incremental decoding.
 3ab0a37 make VP8LSetBitPos() set br->eos_ flag
 2e4312b Lossless decoding: fix eos_ flag condition
 e6609ac fix erroneous dec->status_ setting
 5692eae add a fallback to ALPHA_NO_COMPRESSION
 6ecd5bf ExUtilReadFromStdin: (windows) open stdin in bin mode
 4206ac6 webpmux: (windows) open stdout in binary mode
 d40e885 cwebp: (windows) open stdout in binary mode
 4aaf463 example_util: add ExUtilSetBinaryMode
 4c82ff7 webpmux man page: Clarify some title, descriptions and examples
 23d4fb3 dsp/lossless: workaround gcc-4.9 bug on arm
 5af7719 dsp.h: collect gcc/clang version test macros
 90d1124 enc_neon: enable QuantizeBlock for aarch64
 ee78e78 SmartRGBYUV: fix odd-width problem with pixel replication
 c9ac204 fix some MSVC64 warning about float conversion
 f4497a1 cpu: check for _MSC_VER before using msvc inline asm
 e2159fd faster RGB->YUV conversion function (~7% speedup)
 21abaa0 Add smart RGB->YUV conversion option -pre 4
 1a161e2 configure: add work around for gcc-4.9 aarch64 bug
 55b10de MIPS: mips32r2: added optimization for BSwap32
 76d2192 Update PATENTS to reflect s/VP8/WebM/g
 29a9db1 MIPS: detect mips32r6 and disable mips32r1 code
 245c4a6 Correctly use the AC_CANONICAL_* macros
 40aa8b6 cosmetics
 2ddcca5 cosmetics: remove some extraneous 'extern's
 f40dd7c vp8enci.h: cosmetics: fix '*' placement
 4610c9c bit_writer: cosmetics: rename kFlush() -> Flush()
 fc3c175 dsp: detect mips64 & disable mips32 code
 c1a7955 cwebp.1: restore quality description
 57a7e73 correct alpha_dithering_strength ABI check
 6c83157 correct WebPMemoryWriterClear ABI check
 8af2771 update ChangeLog (tag: v0.4.1, origin/0.4.1, 0.4.1)
 f59c0b4 iosbuild.sh: specify optimization flags
 8d34ea3 update ChangeLog (tag: v0.4.1-rc1)
 dbc3da6 makefile.unix: add vwebp.1 to the dist target
 89a7c83 update ChangeLog
 ffe67ee Merge "update NEWS for the next release" into 0.4.1
 2def1fe gif2webp: dust up the help message
 fb668d7 remove -noalphadither option from README/vwebp.1
 e49f693 update NEWS for the next release
 cd01358 Merge "update AUTHORS" into 0.4.1
 268d01e update AUTHORS
 85213b9 bump version to 0.4.1
 695f80a Merge "restore mux API compatibility" into 0.4.1
 862d296 restore mux API compatibility
 8f6f8c5 remove the !WEBP_REFERENCE_IMPLEMENTATION tweak in Put8x8uv
 d713a69 Merge changes If4debc15,I437a5d5f into 0.4.1
 c2fc52e restore encode API compatibility
 793368e restore decode API compatibility
 b8984f3 gif2webp: fix compile with giflib 5.1.0
 222f9b1 gif2webp: simplify giflib version checking
 d2cc61b Extend MakeARGB32() to accept Alpha channel.
 4595b62 Merge "use explicit size of kErrorMessages[] arrays"
 157de01 Merge "Actuate memory stats for PRINT_MEMORY_INFO"
 fbda2f4 JPEG decoder: delay conversion to YUV to WebPEncode() call
 0b747b1 use explicit size of kErrorMessages[] arrays
 3398d81 Actuate memory stats for PRINT_MEMORY_INFO
 6f3202b Merge "move WebPPictureInit to picture.c"
 6c347bb move WebPPictureInit to picture.c
 fb3acf1 fix configure message for multi-thread
 40b086f configure: check for _beginthreadex
 1549d62 reorder the YUVA->ARGB and ARGB->YUVA functions correctly
 c6461bf Merge "extract colorspace code from picture.c into picture_csp.c"
 736f2a1 extract colorspace code from picture.c into picture_csp.c
 645daa0 Merge "configure: check for -Wformat-security"
 abafed8 configure: check for -Wformat-security
 fbadb48 split monolithic picture.c into picture_{tools,psnr,rescale}.c
 c76f07e dec_neon/TransformAC3: initialize vector w/vcreate
 bb4fc05 gif2webp: Allow single-frame animations
 46fd44c thread: remove harmless race on status_ in End()
 5a1a726 Merge "configure: check for __builtin_bswapXX()"
 6781423 configure: check for __builtin_bswapXX()
 6450c48 configure: fix iOS builds
 6422e68 VP8LFillBitWindow: enable fast path for 32-bit builds
 4f7f52b VP8LFillBitWindow: respect WEBP_FORCE_ALIGNED
 e458bad endian_inl.h: implement htoleXX with BSwapXX
 f2664d1 endian_inl.h: add BSwap16
 6fbf534 Merge "configure: add --enable-aligned"
 dc0f479 configure: add --enable-aligned
 9cc69e2 Merge "configure: support WIC + OpenGL under mingw64"
 257adfb remove experimental YUV444 YUV422 and YUV400 code
 10f4257 configure: support WIC + OpenGL under mingw64
 380cca4 configure.ac: add AC_C_BIGENDIAN
 ee70a90 endian_inl.h: add BSwap64
 47779d4 endian_inl.h: add BSwap32
 d5104b1 utils: add endian_inl.h
 58ab622 Merge "make alpha-detection loop in IsKeyFrame() in good x/y order"
 9d56290 make alpha-detection loop in IsKeyFrame() in good x/y order
 516971b lossless: Remove unaligned read warning
 b8b596f Merge "configure.ac: add an autoconf version prerequisite"
 34b02f8 configure.ac: add an autoconf version prerequisite
 e59f536 neon: normalize vdup_n_* usage
 6ee7160 Merge changes I0da7b3d3,Idad2f278,I4accc305
 abc02f2 Merge "fix (uncompiled) typo"
 bc03670 neon: add INIT_VECTOR4
 6c1c632 neon: add INIT_VECTOR3
 dc7687e neon: add INIT_VECTOR2
 4536e7c add WebPMuxSetCanvasSize() to the mux API
 824eab1 fix (uncompiled) typo
 1f3e5f1 remove unused 'shift' argument and QFIX2 define
 8e86705 Merge "VP8LoadNewBytes: use __builtin_bswap32 if available"
 1b6a263 Merge "Fix handling of weird GIF with canvas dimension 0x0"
 1da3d46 VP8LoadNewBytes: use __builtin_bswap32 if available
 1582e40 Fix handling of weird GIF with canvas dimension 0x0
 b8811da Merge "rename interface -> winterface"
 db8b8b5 Fix logic in the GIF LOOP-detection parsing
 25aaddc rename interface -> winterface
 5584d9d make WebPSetWorkerInterface() check its arguments
 a9ef7ef Merge "cosmetics: update thread.h comments"
 c6af999 Merge "dust up the help message"
 0a8b886 dust up the help message
 a9cf319 cosmetics: update thread.h comments
 27bfeee QuantizeBlock SSE2 Optimization:
 2bc0dc3 Merge "webpmux: warn when odd frame offsets are used"
 3114ebe Merge changes Id8edd3c1,Id418eb96,Ide05e3be
 c072663 webpmux: warn when odd frame offsets are used
 c5c6b40 Merge "add alpha dithering for lossy"
 d514678 examples/Android.mk: add cwebp
 ca0fa7c Android.mk: move dwebp to examples/Android.mk
 73d8fca Android.mk: add ENABLE_SHARED flag
 6e93317 muxread: fix out of bounds read
 8b0f6a4 Makefile.vc: fix CFLAGS assignment w/HAVE_AVX2=1
 bbe32df add alpha dithering for lossy
 7902076 Merge "make error-code reporting consistent upon malloc failure"
 77bf441 make error-code reporting consistent upon malloc failure
 7a93c00 **/Makefile.am: remove unused AM_CPPFLAGS
 24e3080 Add an interface abstraction to the WebP worker thread implementation
 d6cd635 Merge "fix orig_rect==NULL case"
 2bfd1ff fix orig_rect==NULL case
 059e21c Merge "configure: move config.h to src/webp/config.h"
 f05fe00 properly report back encoding error code in WebPFrameCacheAddFrame()
 32b3137 configure: move config.h to src/webp/config.h
 90090d9 Merge changes I7c675e51,I84f7d785
 ae7661b makefiles: define WEBP_HAVE_AVX2 when appropriate
 69fce2e remove the special casing for res->first in VP8SetResidualCoeffs
 6e61a3a configure: test for -msse2
 b9d2efc rename upsampling_mips32.c to yuv_mips32.c
 bdfeeba dsp/yuv: move sse2 functions to yuv_sse2.c
 46b32e8 Merge "configure: set WEBP_HAVE_AVX2 when available"
 88305db Merge "VP8RandomBits2: prevent signed int overflow"
 73fee88 VP8RandomBits2: prevent signed int overflow
 db4860b enc_sse2: prevent signed int overflow
 3fdaf4d Merge "real fix for longjmp warning"
 385e334 real fix for longjmp warning
 230a055 configure: set WEBP_HAVE_AVX2 when available
 a2ac8a4 restore original value_/range_ field order
 5e2ee56 Merge "remove libwebpdspdecode dep on libwebpdsp_avx2"
 61362db remove libwebpdspdecode dep on libwebpdsp_avx2
 42c447a Merge "lossy bit-reader clean-up:"
 479ffd8 Merge "remove unused #include's"
 9754d39 Merge "strong filtering speed-up (~2-3% x86, ~1-2% for NEON)"
 158aff9 remove unused #include's
 09545ee lossy bit-reader clean-up:
 ea8b0a1 strong filtering speed-up (~2-3% x86, ~1-2% for NEON)
 6679f89 Optimize VP8SetResidualCoeffs.
 ac591cf fix for gcc-4.9 warnings about longjmp + local variables
 4dfa86b dsp/cpu: NaCl has no support for xgetbv
 4c39869 Merge "cwebp: fallback to native webp decode in WIC builds"
 33aa497 Merge "cwebp: add some missing newlines in longhelp output"
 c9b340a fix missing WebPInitAlphaProcessing call for premultiplied colorspace output
 57897ba Merge "lossless_neon: use vcreate_*() where appropriate"
 6aa4777 Merge "(enc|dec)_neon: use vcreate_*() where appropriate"
 0d346e4 Always reinit VP8TransformWHT instead of hard-coding
 7d039fc cwebp: fallback to native webp decode in WIC builds
 d471f42 cwebp: add some missing newlines in longhelp output
 bf0e003 lossless_neon: use vcreate_*() where appropriate
 9251c2f (enc|dec)_neon: use vcreate_*() where appropriate
 399b916 lossy decoding: correct alpha-rescaling for YUVA format
 78c12ed Merge "Makefile.vc: add rudimentary avx2 support"
 dc5b122 try to remove the spurious warning for static analysis
 ddfefd6 Makefile.vc: add rudimentary avx2 support
 a891164 Merge "simplify VP8LInitBitReader()"
 fdbcd44 simplify VP8LInitBitReader()
 7c00428 makefile.unix: add rudimentary avx2 support
 515e35c Merge "add stub dsp/enc_avx2.c"
 a05dc14 SSE2: yuv->rgb speed-up for point-sampling
 178e9a6 add stub dsp/enc_avx2.c
 1b99c09 Merge "configure: add a test for -mavx2"
 fe72807 configure: add a test for -mavx2
 e46a247 cpu: fix check for __cpuidex availability
 176fda2 fix the bit-writer for lossless in 32bit mode
 541784c dsp.h: add a check for AVX2 / define WEBP_USE_AVX2
 bdb151e dsp/cpu: add AVX2 detection
 ab9f2f8 Merge "revamp the point-sampling functions by processing a full plane"
 a2f8b28 revamp the point-sampling functions by processing a full plane
 ef07602 use decoder's DSP functions for autofilter
 2b5cb32 Merge "dsp/cpu: add AVX detection"
 df08e67 dsp/cpu: add AVX detection
 e2f405c Merge "clean-up and slight speed-up in-loop filtering SSE2"
 f60957b clean-up and slight speed-up in-loop filtering SSE2
 9fc3ae4 .gitattributes: treat .ppm as binary
 3da924b Merge "dsp/WEBP_USE_NEON: test for __aarch64__"
 c716449 Android.mk: always include *_neon.c in the build
 a577b23 dsp/WEBP_USE_NEON: test for __aarch64__
 54bfffc move RemapBitReader() from idec.c to bit_reader code
 34168ec Merge "remove all unused layer code"
 f1e7717 remove all unused layer code
 b0757db Code cleanup for VP8LGetHistoImageSymbols.
 5fe628d make the token page size be variable instead of fixed 8192
 f948d08 memory debug: allow setting pre-defined malloc failure points
 ca3d746 use block-based allocation for backward refs storage, and free-lists
 1ba61b0 enable NEON intrinsics in aarch64 builds
 b9d2bb6 dsp/neon.h: coalesce intrinsics-related defines
 b5c7525 iosbuild: add support for iOSv7/aarch64
 9383afd Reduce number of memory allocations while decoding lossless.
 888e63e Merge "dsp/lossless: prevent signed int overflow in left shift ops"
 8137f3e Merge "instrument memory allocation routines for debugging"
 2aa1873 instrument memory allocation routines for debugging
 d3bcf72 Don't allocate VP8LHashChain, but treat like automatic object
 bd6b861 dsp/lossless: prevent signed int overflow in left shift ops
 b7f19b8 Merge "dec/vp8l: prevent signed int overflow in left shift ops"
 29059d5 Merge "remove some uint64_t casts and use."
 e69a1df dec/vp8l: prevent signed int overflow in left shift ops
 cf5eb8a remove some uint64_t casts and use.
 38e2db3 MIPS: MIPS32r1: Added optimization for HistogramAdd.
 e0609ad dwebp: fix exit code on webp load failure
 bbd358a Merge "example_util.h: avoid forward declaring enums"
 8955da2 example_util.h: avoid forward declaring enums
 6d6865f Added SSE2 variants for Average2/3/4
 b3a616b make HistogramAdd() a pointer in dsp
 c8bbb63 dec_neon: relocate some inline-asm defines
 4e393bb dec_neon: enable intrinsics-only functions
 ba99a92 dec_neon: use positive tests for USE_INTRINSICS
 69058ff Merge "example_util: add ExUtilDecodeWebPIncremental"
 a7828e8 dec_neon: make WORK_AROUND_GCC conditional on version
 3f3d717 Merge "enc_neon: enable intrinsics-only functions"
 de3cb6c Merge "move LOCAL_GCC_VERSION def to dsp.h"
 1b2fe14 example_util: add ExUtilDecodeWebPIncremental
 ca49e7a Merge "enc_neon: move Transpose4x4 to dsp/neon.h"
 ad900ab Merge "fix warning about size_t -> int conversion"
 4825b43 fix warning about size_t -> int conversion
 42b35e0 enc_neon: enable intrinsics-only functions
 f937e01 move LOCAL_GCC_VERSION def to dsp.h
 5e1a17e enc_neon: move Transpose4x4 to dsp/neon.h
 c7b92a5 dec_neon: (WORK_AROUND_GCC) delete unused Load4x8
 8e5f90b Merge "make ExUtilLoadWebP() accept NULL bitstream param."
 05d4c1b Merge "cwebp: add webpdec"
 ddeb6ac cwebp: add webpdec
 35d7d09 Merge "Reduce memory footprint for encoding WebP lossless."
 0b89610 Reduce memory footprint for encoding WebP lossless.
 f0b65c9 make ExUtilLoadWebP() accept NULL bitstream param.
 9c0a60c Merge "dwebp: move webp decoding to example_util"
 1d62acf MIPS: MIPS32r1: Added optimization for HuffmanCost functions.
 4a0e739 dwebp: move webp decoding to example_util
 c022046 Merge "Bugfix: Incremental decode of lossy-alpha"
 8c7cd72 Bugfix: Incremental decode of lossy-alpha
 7955152 MIPS: fix error with number of registers.
 b1dabe3 Merge "Move the HuffmanCost() function to dsp lib"
 75b1200 Move the HuffmanCost() function to dsp lib
 2772b8b MIPS: fix assembler error revealed by clang's debug build
 6653b60 enc_mips32: fix unused symbol warning in debug
 8dec120 enc_mips32: disable ITransform(One) in debug builds
 98519dd enc_neon: convert Disto4x4 to intrinsics
 fe9317c cosmetics:
 953b074 enc_neon: cosmetics
 a9fc697 Merge "WIP: extract the float-calculation of HuffmanCost from loop"
 3f84b52 Merge "replace some mult-long (vmull_u8) with mult-long-accumulate (vmlal_u8)"
 4ae0533 MIPS: MIPS32r1: Added optimizations for ExtraCost functions.
 b30a04c WIP: extract the float-calculation of HuffmanCost from loop
 a8fe8ce Merge "NEON intrinsics version of CollectHistogram"
 95203d2 NEON intrinsics version of CollectHistogram
 7ca2e74 replace some mult-long (vmull_u8) with mult-long-accumulate (vmlal_u8)
 41c6efb fix lossless_neon.c
 8ff96a0 NEON intrinsics version of FTransform
 0214f4a Merge "MIPS: MIPS32r1: Added optimizations for FastLog2"
 baabf1e MIPS: MIPS32r1: Added optimizations for FastLog2
 3d49871 NEON functions for lossless coding
 3fe0291 MIPS: MIPS32r1: Added optimizations for SSE functions.
 c503b48 Merge "fix the gcc-4.6.0 bug by implementing alternative method"
 abe6f48 fix the gcc-4.6.0 bug by implementing alternative method
 5598bde enc_mips32.c: fix file mode
 2b1b4d5 MIPS: MIPS32r1: Add optimization for GetResidualCost
 f0a1f3c Merge "MIPS: MIPS32r1: Added optimization for FTransform"
 7231f61 MIPS: MIPS32r1: Added optimization for FTransform
 869eaf6  ~30% encoding speedup: use NEON for QuantizeBlock()
 f758af6 enc_neon: convert FTransformWHT to intrinsics
 7dad095 MIPS: MIPS32r1: Added optimization for Disto4x4 (TTransform)
 2298d5f MIPS: MIPS32r1: Added optimization for QuantizeBlock
 e88150c Merge "MIPS: MIPS32r1: Add optimization for ITransform"
 de693f2 lossless_neon: disable VP8LConvert* functions
 4143332 NEON intrinsics for encoding
 0ca2914 MIPS: MIPS32r1: Add optimization for ITransform
 71bca5e dec_neon: use vst_lane instead of vget_lane
 bf06105 Intrinsics NEON version of TransformOne
 19c6f1b Merge "dec_neon: use vld?_lane instead of vset?_lane"
 7a94c0c upsampling_neon: drop NEON suffix from local functions
 d14669c upsampling_sse2: drop SSE2 suffix from local functions
 2ca42a4 enc_sse2: drop SSE2 suffix from local functions
 d038e61 dec_sse2: drop SSE2 suffix from local functions
 fa52d75 dec_neon: use vld?_lane instead of vset?_lane
 c520e77 cosmetic: fix long line
 4b0f2da Merge "add intrinsics NEON code for chroma strong-filtering"
 e351ec0 add intrinsics NEON code for chroma strong-filtering
 aaf734b Merge "Add SSE2 version of forward cross-color transform"
 c90a902 Add SSE2 version of forward cross-color transform
 bc374ff Use histogram_bits to initalize transform_bits.
 2132992 Merge "Add strong filtering intrinsics (inner and outer edges)"
 5fbff3a Add strong filtering intrinsics (inner and outer edges)
 d4813f0 Add SSE2 function for Inverse Cross-color Transform
 2602956 dec_neon: add strong loopfilter intrinsics
 cca7d7e Merge "add intrinsics version of SimpleHFilter16NEON()"
 1a05dfa windows: fix dll builds
 d6c50d8 Merge "add some colorspace conversion functions in NEON"
 4fd7c82 SSE2 variants of Subtract-Green: Rectify loop condition
 97e5fac add some colorspace conversion functions in NEON
 b9a7a45 add intrinsics version of SimpleHFilter16NEON()
 daccbf4 add light filtering NEON intrinsics
 af44460 fix typo in STORE_WHT
 6af6b8e Tune HistogramCombineBin for large images.
 af93bdd use WebPSafe[CM]alloc/WebPSafeFree instead of [cm]alloc/free
 51f406a lossless_sse2: relocate VP8LDspInitSSE2 proto
 0f4f721 separate SSE2 lossless functions into its own file
 514fc25 VP8LConvertFromBGRA: use conversion function pointers
 6d2f352 dsp/dec: TransformDCUV: use VP8TransformDC
 defc8e1 Merge "fix out-of-bound read during alpha-plane decoding"
 fbed364 Merge "dsp: reuse wht transform from dec in encoder"
 d846708 Merge "Add SSE2 version of ARGB -> BGR/RGB/... conversion functions"
 207d03b fix out-of-bound read during alpha-plane decoding
 d1b33ad 2-5% faster trellis with clang/MacOS (and ~2-3% on ARM)
 369c26d Add SSE2 version of ARGB -> BGR/RGB/... conversion functions
 df230f2 dsp: reuse wht transform from dec in encoder
 80e218d Android.mk: fix build with APP_ABI=armeabi-v7a-hard
 59daf08 Merge "cosmetics:"
 5362200 cosmetics:
 3e7f34a AssignSegments: quiet array-bounds warning
 3c2ebf5 Merge "UpdateHistogramCost: avoid implicit double->float"
 cf821c8 UpdateHistogramCost: avoid implicit double->float
 312e638 Extend the search space for GetBestGreenRedToBlue
 1c58526 Fix few nits
 fef2270 Optimize and re-structure VP8LGetHistoImageSymbols
 068b14a Optimize lossless decoding.
 5f0cfa8 Do a binary search to get the optimum cache bits.
 24ca367 Merge "allow 'cwebp -o -' to emit output to stdout"
 e12f874 allow 'cwebp -o -' to emit output to stdout
 2bcad89 allow some more stdin/stout I/O
 84ed4b3 fix cwebp.1 typos after patch #69199
 65b99f1 add a -z option to cwebp, and WebPConfigLosslessPreset() function
 3017661 4-5% faster trellis by removing some unneeded calculations.
 687a58e histogram.c: reindent after b33e8a0
 06d456f Merge "~3-4% faster lossless encoding"
 c60de26 ~3-4% faster lossless encoding
 42eb06f Merge "few cosmetics after patch #69079"
 82af826 few cosmetics after patch #69079
 b33e8a0 Refactor code for HistogramCombine.
 ca1bfff Merge "5-10% encoding speedup with faster trellis (-m 6)"
 5aeeb08 5-10% encoding speedup with faster trellis (-m 6)
 82ae1bf cosmetics: normalize VP8GetCPUInfo checks
 e3dd924 Merge "Refactor GetBestPredictorForTile for future tuning."
 206cc1b Refactor GetBestPredictorForTile for future tuning.
 3cb8406 Merge "speed-up trellis quant (~5-10% overall speed-up)"
 b66f222 Merge "lossy encoding: ~3% speed-up"
 4287d0d speed-up trellis quant (~5-10% overall speed-up)
 390c8b3 lossy encoding: ~3% speed-up
 9a463c4 Merge "dec_neon: convert TransformWHT to intrinsics"
 e8605e9 Merge "dec_neon: add ConvertU8ToS16"
 4aa3e41 MIPS: MIPS32r1: rescaler bugfix
 c16cd99 Speed up lossless encoder.
 9d6b5ff dec_neon: convert TransformWHT to intrinsics
 2ff0aae dec_neon: add ConvertU8ToS16
 77a8f91 fix compilation with USE_YUVj flag
 4acbec1 Merge changes I3b240ffb,Ia9370283,Ia2d28728
 2719bb7 dec_neon: TransformAC3: work on packed vectors
 b7b60ca dec_neon: add SaturateAndStore4x4
 b7685d7 Rescale: let ImportRow / ExportRow be pointer-to-function
 e02f16e dec_neon.c: convert TransformDC to intrinsics
 9cba963 add missing file
 8992ddb use static clipping tables
 0235d5e 1-2% faster quantization in SSE2
 b2fbc36 fix VC12-x64 warning
 6e37cb9 Merge "cosmetics: backward_references.c: reindent after a7d2ee3"
 a42ea97 cosmetics: backward_references.c: reindent after a7d2ee3
 6c32744 Merge "fix missing __BIG_ENDIAN__ definition on some platform"
 a8b6aad fix missing __BIG_ENDIAN__ definition on some platform
 fde2904 Increase initial buffer size for VP8L Bit Writer.
 a7d2ee3 Optimize cache estimate logic.
 7fb6095 Merge "dec_neon.c: add TransformAC3"
 bf182e8 VP8LBitWriter: use a bit-accumulator
 3f40b4a Merge "MIPS: MIPS32r1: clang macro warning resolved"
 1684f4e WebP Decoder: Mark some truncated bitstreams as invalid
 acbedac MIPS: MIPS32r1: clang macro warning resolved
 228e487 dec_neon.c: add TransformAC3
 393f89b Android.mk: avoid gcc-specific flags with clang
 32aeaf1 revamp VP8LColorSpaceTransform() a bit
 0c7cc4c Merge "Don't dereference NULL, ensure HashChain fully initialized"
 391316f Don't dereference NULL, ensure HashChain fully initialized
 926ff40 WEBP_SWAP_16BIT_CSP: remove code dup
 1d1cd3b Fix decode bug for rgbA_4444/RGBA_4444 color-modes.
 939e70e update AUTHORS file
 8934a62 cosmetics: *_mips32.c
 dd438c9 MIPS: MIPS32r1: Optimization of some simple point-sampling functions. PATCH [6/6]
 5352091 Added support for calling sampling functions via pointers.
 d16c697 MIPS: MIPS32r1: Optimization of filter functions. PATCH [5/6]
 04336fc MIPS: MIPS32r1: Optimization of function TransformOne. PATCH [4/6]
 92d8fc7 MIPS: MIPS32r1: Optimization of function WebPRescalerImportRow. PATCH [3/6]
 bbc23ff parse one row of intra modes altogether
 a2f608f Merge "MIPS: MIPS32r1: Optimization of function WebPRescalerExportRow. [2/6]"
 8823085 MIPS: MIPS32r1: Optimization of function WebPRescalerExportRow. [2/6]
 c5a5b02 decode mt+incremental: fix segfault in debug builds
 9882b2f always use fast-analysis for all methods.
 000adac Merge "autoconf: update ax_pthread.m4"
 2d2fc37 update .gitignore
 5bf4255 Merge "Make it possible to avoid automagic dependencies"
 c1cb193 disable NEON for arm64 platform
 73a304e Make it possible to avoid automagic dependencies
 4d493f8 MIPS: MIPS32r1: Decoder bit reader function optimized. PATCH [1/6]
 c741183 make WebPCleanupTransparentArea work with argb picture
 5da1855 add a decoding option to flip image vertically
 00c3c4e Merge "add man/vwebp.1"
 2c6bb42 add man/vwebp.1
 ea59a8e Merge "Merge tag 'v0.4.0'"
 7574bed fix comments related to array sizes
 0b5a90f dwebp.1: fix option formatting
 effcb0f Merge tag 'v0.4.0'
 7c76255 autoconf: update ax_pthread.m4
 fff2a11 make -short work with -print_ssim, -print_psnr, etc.
 68e7901 update ChangeLog (tag: v0.4.0-rc1, tag: v0.4.0, origin/0.4.0, 0.4.0)
 256e433 update NEWS description with new general features
 2962534 Merge "gif2webp: don't use C99 %zu" into 0.4.0
 3b9f9dd gif2webp: don't use C99 %zu
 b5b2e3c cwebp: fix metadata output w/lossy+alpha
 ad26df1 makefile.unix: clean up libgif2webp_util.a
 c3b4557 update Changelog
 ca84112 Merge "bump version to 0.4.0" into 0.4.0
 8c524db bump version to 0.4.0
 eec2398 update AUTHORS & .mailmap
 b9bbf6a update NEWS for 0.4.0
 c72e081 Merge "dec/webp.c: don't wait for data before reporting w/h"
 5ad6531 dec/frame.c: fix formatting
 f7fc4bc dec/webp.c: don't wait for data before reporting w/h
 66a32af Merge "NEON speed up"
 26d842e NEON speed up
 f307f98 Merge "webpmux: let -- stop parameter parsing"
 fe051da Merge "README: add a section on gif2webp"
 6fd2bd6 Merge "manpage pedantry"
 4af1900 README: add a section on gif2webp
 6f36ade manpage pedantry
 f9016cb README: update dwebp options
 b4fa0a4 webpmux: let -- stop parameter parsing
 a9a20ac gif2webp: Add a multi-threaded encode option
 495bef4 fix bug in TrellisQuantize
 605a712 simplify __cplusplus ifdef
 33109f9 Merge "drop: ifdef __cplusplus checks from C files"
 7f9de0b Merge changes I994a5587,I8467bb71,I13b50688,I1e2c9c7b
 5459030 gif2webp: let -- stop parameter parsing
 a4b0aa0 vwebp: let -- stop parameter parsing
 98af68f cwebp: let -- stop parameter parsing
 a33831e dwebp: let -- stop parameter parsing
 3630124 add some checks on error paths
 ce4c713 Merge "autoconf: add --disable-wic"
 5227d99 drop: ifdef __cplusplus checks from C files
 f645355 dwebp.1: fix typo
 f91034f Merge "cwebp: print metadata stats when no output file is given"
 d493455 gif2webp: Backward compatibility for giflib version <= 4.1.3
 4c617d3 gif2webp: Disable output of ICC profile by default
 73b731f introduce a special quantization function for WHT
 41c0cc4 Make Forward WHT transform use 32bit fixed-point calculation
 a3359f5 Only compute quantization params once
 7049043 cwebp: print metadata stats when no output file is given
 d513bb6 * fix off-by-one zthresh calculation * remove the sharpening for non luma-AC coeffs * adjust the bias a little bit to compensate for this
 ad9dec0 Merge "cosmetics: dwebp: fix local function name format"
 f737f03 Merge "dwebp: remove a dead store"
 3c3a70d Merge "makefile.unix: install binaries in $(DESTDIR)/bin/"
 150b655 Merge "Android.mk: add some release compile flags"
 dbebd33 cosmetics: dwebp: fix local function name format
 2774995 dwebp: remove a dead store
 a01e04f autoconf: add --disable-wic
 5009b22 makefile.unix: install binaries in $(DESTDIR)/bin/
 bab30fc Merge "fix -print_psnr / ssim options"
 ebef7fb fix -print_psnr / ssim options
 cb63785 Merge "fix bug due to overzealous check in WebPPictureYUVAToARGB()"
 8189885 Merge "EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE"
 4ad7d33 Android.mk: add some release compile flags
 c12e236 cosmetics: fix a few typos
 6f10403 fix bug due to overzealous check in WebPPictureYUVAToARGB()
 3f6c35c EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE
 cc55790 Merge changes I8bb7a4dc,I2c180051,I021a014f,I8a224a62
 c536afb Merge "cosmetics: fix some typos"
 cbdd3e6 add a -dither dithering option to the decoder
 e812401 Updated iosbuild.sh for XCode 5.x
 4931c32 cosmetics: fix some typos
 05aacf7 mux: add some missing casts
 617d934 enc/vp8l: add a missing cast
 46db286 idec: add some missing casts
 b524e33 ErrorStatusLossless: correct return type
 cb261f7 fix a descaling bug for vertical/horizontal U/V interpolation
 bcb3955 Merge changes I48968468,I181bc736
 73f5213 gif2webp: Add a mixed compression mode
 6198715 demux: split chunk parsing from ParseVP8X
 d2e3f4e demux: add a tail pointer for chunks
 87cffcc demux: cosmetics: s/has_frames/is_animation/
 e18e667 demux: strictly enforce the animation flag
 c4f39f4 demux: cosmetics: remove a useless break
 61cb884 demux: (non-exp) fail if the fragmented flag is set
 ff379db few % speedup of lossless encoding
 df3649a remove all disabled code related to P-frames
 6d0cb3d Merge "gif2webp: kmin = 0 should suppress key-frame addition."
 3655598 gif2webp: kmin = 0 should suppress key-frame addition.
 7708e60 Merge "detect flatness in blocks and favor DC prediction"
 06b1503 Merge "add comment about the kLevelsFromDelta[][] LUT generation"
 5935259 add comment about the kLevelsFromDelta[][] LUT generation
 e3312ea detect flatness in blocks and favor DC prediction
 ebc9b1e Merge "VPLBitReader bugfix: Catch error if bit_pos > LBITS too."
 96ad0e0 VPLBitReader bugfix: Catch error if bit_pos > LBITS too.
 a014e9c tune quantization biases toward higher precision
 1e89861 add helpful PrintBlockInfo() function
 596a6d7 make use of 'extern' consistent in function declarations
 c8d48c6 Merge "extract random utils to their own file util/random.[ch]"
 98aa33c extract random utils to their own file util/random.[ch]
 432a723 Merge "swig: add basic go bindings"
 fab618b Merge "rename libwebp.i -> libwebp.swig"
 e4e7fcd swig: add basic go bindings
 d340872 Merge "fast auto-determined filtering strength"
 f8bfd5c fast auto-determined filtering strength
 ac0bf95 small clean-up in ExpandMatrix()
 1939607 rename libwebp.i -> libwebp.swig
 43148b6 filtering: precompute ilimit and hev_threshold
 18f992e simplify f_inner calculation a little
 241d11f add missing const
 86c0031 add a 'format' field to WebPBitstreamFeatures
 dde91fd Demux: Correct the extended format validation
 5d6c5bd add entry for '-resize' option in cwebp's man
 7c098d1 Use some gamma-curve range compression when computing U/V average
 0b2b050 Use deterministic random-dithering during RGB->YUV conversion
 8a2fa09 Add a second multi-thread method
 7d6f2da Merge "up to 20% faster multi-threaded decoding"
 266f63e Merge "libwebp.jar: build w/Java 1.6 for Android compat"
 0532149 up to 20% faster multi-threaded decoding
 38efdc2 Simplify the gif2webp tool: move the optimization details to util
 de89951 libwebp.jar: build w/Java 1.6 for Android compat
 cb22155 Decode a full row of bitstream before reconstructing
 dca8a4d Merge "NEON/simple loopfilter: avoid q4-q7 registers"
 9e84d90 Merge "NEON/TransformWHT: avoid q4-q7 registers"
 fc10249 NEON/simple loopfilter: avoid q4-q7 registers
 2f09d63 NEON/TransformWHT: avoid q4-q7 registers
 77585a2 Merge "use a macrofunc for setting NzCoeffs bits"
 d155507 Merge "use HINT_GRAPH as image_hint for gif source"
 9c56164 Merge "only print GIF_DISPOSE_WARNING once"
 0587986 use HINT_GRAPH as image_hint for gif source
 0b28d7a use a macrofunc for setting NzCoeffs bits
 f9bbc2a Special-case sparse transform
 0012519 gif2webp: detect and flatten uniformly similar blocks
 0deaf0f only print GIF_DISPOSE_WARNING once
 6a8c0eb Merge "small optimization in segment-smoothing loop"
 f7146bc small optimization in segment-smoothing loop
 5a7533c small gif2webp fix
 4df0c89 Merge changes Ic697660c,I27285521
 5b2e6bd Android.mk: add a dwebp target
 f910a84 Android.mk: update build flags
 63f9aba special-case WHT transform when there's only DC
 80911ae Merge "7-8% faster decoding by rewriting GetCoeffs()"
 606c430 gif2webp: Improved compression for lossy animated WebP
 fb887f7 gif2webp: Different kmin/kmax defaults for lossy and lossless
 2a98136 7-8% faster decoding by rewriting GetCoeffs()
 92d47e4 improve VP8L signature detection by checking the version bits too
 5cd43e4 Add -incremental option to dwebp
 54b8e3f webpmux: DisplayInfo(): remove unnecessary error checks.
 40ae352 fix memleak in WebPIDelete()
 d966265 mux.h doc: WebPMuxGetFrame() can return WEBP_MUX_MEMORY_ERROR too.
 0e6747f webpmux -info: display dimensions and has_alpha per frame
 d78a82c Sanity check for underflow
 8498f4b Merge "remove -Wshadow warnings"
 e89c6fc Avoid a potential memleak
 3ebe175 Merge "break down the proba 4D-array into some handy structs"
 6a44550 break down the proba 4D-array into some handy structs
 2f5e893 remove -Wshadow warnings
 bf3a29b Merge "add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags"
 2b0a759 Merge "fix some warnings from static analysis"
 22dd07c mux.h: Some doc corrections
 79ff034 add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags
 d51f45f fix some warnings from static analysis
 d134307 fix conversion warning on MSVC
 d538cea gif2webp: Support a 'min' and 'max'  key frame interval
 80b54e1 allow search with token buffer loop and fix PARTITION0 problem
 b7d4e04 add VP8EstimateTokenSize()
 10fddf5 enc/quant.c: silence a warning
 399cd45 Merge "fix compile error on ARM/gcc"
 9f24519 encoder: misc rate-related fixes
 c663bb2 Merge "simplify VP8IteratorSaveBoundary() arg passing"
 fa46b31 Demux.h: Correct a method name reference
 f8398c9 fix compile error on ARM/gcc
 f691f0e simplify VP8IteratorSaveBoundary() arg passing
 42542be up to 6% faster encoding with clang compiler
 93402f0 multi-threaded segment analysis
 7e2d659 Merge "remove the PACK() bit-packing tricks"
 c13fecf remove the PACK() bit-packing tricks
 2fd091c Merge "use NULL for lf_stats_ testing, not bool"
 b11c9d6 dwebp: use default dct_method
 4bb8465 Merge "(de)mux.h: wrap pseudo-code in /* */"
 cfb56b1 make -pass option work with token buffers
 5416aab (de)mux.h: wrap pseudo-code in /* */
 35dba33 use NULL for lf_stats_ testing, not bool
 733a7fa enc->Iterator memory cleanup
 e81fac8 Add support for "no blend" in webpmux binary
 3b80bc4 gif2webp: Separate out each step into a method
 bef7e9c Add doc precision about demux object keeping pointers to data.
 61405a1 dwebp: enable stdout output with WIC
 6eabb88 Merge "Animated WebP: add "do no blend" option to spec"
 be20dec fix compilation for BITS 24
 e58cc13 Merge "dwebp: s/unsigned char/uint8_t/"
 72501d4 dwebp: s/unsigned char/uint8_t/
 2c9633e Merge "gif2webp: Insert independent frames at regular intervals."
 f0d6a14 gif2webp: Insert independent frames at regular intervals.
 b25a6fb yuv.h: fix indent
 ede3602 Merge "cosmetics: fix indent"
 3a65122 dwebp: fix stdout related output
 388a724 cosmetics: fix indent
 4c7322c Merge "dsp: msvc compatibility"
 d50c7e3 Merge "5-7% faster SSE2 versions of YUV->RGB conversion functions"
 b8ab784 Merge "simplify upsampler calls: only allow 'bottom' to be NULL"
 df6cebf 5-7% faster SSE2 versions of YUV->RGB conversion functions
 ad6ac32 simplify upsampler calls: only allow 'bottom' to be NULL
 a5e8afa output to stdout if file name is "-"
 f358450 dsp: msvc compatibility
 43a7c8e Merge "cosmetics"
 4c5f19c Merge "bit_reader.h: cosmetics"
 f72fab7 cosmetics
 14dd5e7 fix const-ness
 b20aec4 Merge "Support for 'do not blend' option in vwebp"
 dcf6522 Support for 'do not blend' option in vwebp
 d5bad03 Animated WebP: add "do no blend" option to spec
 a2f5f73 Merge "Support for "Do not blend" in mux and demux libraries"
 e081f2f Pack code & extra_bits to Struct (VP8LPrefixCode).
 6284854 Support for "Do not blend" in mux and demux libraries
 f486aaa Merge "slightly faster ParseIntraMode"
 d171863 slightly faster ParseIntraMode
 3ceca8a bit_reader.h: cosmetics
 69257f7 Create LUT for PrefixEncode.
 988b708 add WebPWorkerExecute() for convenient bypass
 06e2498 Merge "VP8EncIterator clean-up"
 de4d4ad VP8EncIterator clean-up
 7bbe952 Merge "cosmetics: thread.c: drop a redundant comment"
 da41148 cosmetics: thread.c: drop a redundant comment
 feb4b6e thread.h: #ifdef when checking WEBP_USE_THREAD
 8924a3a thread.c: drop WebPWorker prefix from static funcs
 1aed8f2 Merge "fix indent"
 4038ed1 fix indent
 1693fd9 Demux: A new state WEBP_DEMUX_PARSE_ERROR
 8dcae8b fix rescaling-with-alpha inaccuracy
 11249ab Merge changes I9b4dc36c,I4e0eef4d
 52508a1 Mux: support parsing unknown chunks within a frame/fragment.
 05db057 WebPMuxSetChunk: remove unused variable
 8ba1bf6 Stricter check for presence of alpha when writing lossless images
 a03c351 Demux: WebPIterator now also denotes if the frame has alpha.
 6df743a Decoder: handle fragments case correctly too.
 faa4b07 Support for unknown chunks in mux library
 7d60bbc Speed up HashChainFindCopy function.
 6674014 Speedup Alpha plane encoding.
 b7346a1 0.1 % speedup to decoding
 c606182 webp-container-spec: Tighten language added by last
 a34a502 pngdec: output error messages from libpng
 e84c625 Merge "Detect canvas and image size mismatch in decoder."
 f626fe2 Detect canvas and image size mismatch in decoder.
 f5fbdee demux: stricter image bounds check
 30c8158 add extra assert in Huffman decode code
 8967b9f SSE2 for lossless decoding (critical) functions.
 699d80e Jump-lookup for Huffman coding
 c34307a fix some VS9 warnings about type conversion
 eeada35 pngdec: add missing include
 54b6510 gif2webp: If aligning to even offsets, extra pixels should be transparent
 0bcf5ce Merge "remove a malloc() in case we're using only FILTER_NONE for alpha"
 2c07143 remove a malloc() in case we're using only FILTER_NONE for alpha
 a4d5f59 Faster lossless decoding
 fd53bb7 Merge "alternate LUT-base reverse-bits code"
 d1c166e Merge "Container spec: a clarification on background color."
 fdb9177 Rename a method
 5e96753 Container spec: a clarification on background color.
 30e77d0 Merge branch '0.3.0'
 1b631e2 alternate LUT-base reverse-bits code
 24cc307 ~20% faster lossless decoding
 313d853 Speedup for decoding lossless WebP photographs:
 24ee098 change the bytes_per_pixels_ field into more evocative use_8b_decode
 2a04b03 update ChangeLog (tag: v0.3.1-rc2, tag: v0.3.1)
 7288950 Regression fix for alpha channels using color cache:
 2e377b5 wicdec: silence a format warning
 ad9e42a muxedit: silence some uninitialized warnings
 3307c16 Don't set alpha-channel to 0xff for alpha->green uplift
 5130770 Merge "wicdec: silence a format warning"
 a37eff4 Regression fix for alpha channels using color cache:
 241cf99 Merge "muxedit: silence some uninitialized warnings"
 c8f9c84 Regression fix for alpha unfiltering:
 14cd5c6 muxedit: silence some uninitialized warnings
 a368db8 dec/vp8l: quiet vs9 x64 type conversion warning
 ffae9f3 wicdec: silence a format warning
 8cf0701 Alpha encoding: never filter in case of NO_COMPRESSION
 825e73b update ChangeLog (tag: v0.3.1-rc1)
 abf6f69 update NEWS
 5a92c1a bump version to 0.3.1
 86daf77 store top Y/U/V samples in packed fashion
 67bc353 Revert "add WebPBlendAlpha() function to blend colors against background"
 068db59 Intertwined decoding of alpha and RGB
 38cc011 Simplify forward-WHT + SSE2 version
 3fa595a Support decoding upto given row in DECODE_DATA_FUNC
 520f005 DequantizeLevels(): Add 'row' and 'num_rows' args
 47374b8 Alpha unfilter for given set of rows
 f32097e probe input file and quick-check for WebP format.
 a2aed1d configure: improve gl/glut library test
 c7e89cb update copyright text
 a00380d configure: remove use of AS_VAR_APPEND
 a94a88d fix EXIF parsing in PNG
 a71e5d8 add doc precision for WebPPictureCopy() and WebPPictureView()
 8287012 remove datatype qualifier for vmnv
 e190843 fix a memory leak in gif2webp
 0b18b9e fix two minor memory leaks in webpmux
 db5095d remove some cruft from swig/libwebp.jar
 850e956 README: update swig notes
 bddd9b0 swig/python: add minimal documentation
 d573a8d swig: add python encode support
 6b93187 swig/java: reduce wrapper function code duplication
 6fe536f swig/java: rework uint8_t typemap
 a2ea464 Fix the bug in ApplyPalette.
 7bb28d2 webp/lossless: fix big endian BGRA output
 f036d4b Speed up ApplyPalette for ARGB pixels.
 8112c8c remove some warnings:
 cc128e0 Further reduce memory to decode lossy+alpha images
 07db70d fix for big-endian
 eda8a7d gif2webp: Fix signed/unsigned comparison mismatch
 31f346f Makefile.vc: fix libwebpdemux dll variable typo
 6c76d28 swig: add python (decode) support
 b4f5bb6 swig: cosmetics
 498d4dd WebP-Lossless encoding improvements.
 26e7244 swig: ifdef some Java specific code
 8ecec68 configure: add warning related flags
 e676b04 configure: add GLUT detection; build vwebp
 b0ffc43 Alpha decoding: significantly reduce memory usage
 20aa7a8 configure: add --enable-everything
 b8307cc configure.ac: add some helper macros
 980e7ae Remove the gcc compilation comments
 7f25ff9 gif2webp: Fix ICC and XMP support
 d8e5321 Add missing name to AUTHORS
 11edf5e Demux: Fix a potential memleak
 c7b9218 don't forward declare enums
 7a650c6 prevent signed int overflow in left shift ops
 31bea32 add precision about dynamic output reallocation with IDecoder
 c22877f Add incremental support for extended format files
 5051245 Makefile.vc: have 'all' target build everything
 8191dec Makefile.vc: flags cleanup
 b9d7473 Makefile.vc: drop /FD flag
 5568dbc update gitignore
 f4c7b65 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded.
 1fb04be pngdec: Avoid a double-free.
 dcbb1ca add WebPBlendAlpha() function to blend colors against background
 bc9f5fb configure.ac: add AM_PROG_AR for automake >= 1.12
 bf867bf Tuned cross_color parameter (step) for lower qual
 90e2ec5 Merge "probe input file and quick-check for WebP format."
 7180d7f Merge "update copyright text"
 830f72b probe input file and quick-check for WebP format.
 2ccf58d configure: improve gl/glut library test
 d640614 update copyright text
 c2113ad Merge "configure: remove use of AS_VAR_APPEND"
 9326a56 configure: remove use of AS_VAR_APPEND
 ea63d61 fix a type warning on VS9 x86
 bec1109 fix EXIF parsing in PNG
 b6e65f3 Merge "fix warnings for vs9 x64"
 438946d fix warnings for vs9 x64
 f4710e3 collect macroblock reconstruction data in VP8MBData struct
 23d28e2 add doc precision for WebPPictureCopy() and WebPPictureView()
 518f2cd cosmetics: gif2webp: fix indent
 af358e6 Merge "remove datatype qualifier for vmnv"
 3fe9163 remove datatype qualifier for vmnv
 764fdff fix a memory leak in gif2webp
 3e59a74 fix two minor memory leaks in webpmux
 47b9862 Merge "README: update swig notes"
 325d15f remove some cruft from swig/libwebp.jar
 4a7627c README: update swig notes
 5da81e3 Merge "swig/python: add minimal documentation"
 f39e08f Merge "swig: add python encode support"
 6ca4a3e Merge "swig/java: reduce wrapper function code duplication"
 8f8702b Merge "swig/java: rework uint8_t typemap"
 91413be reduce memory for VP8MB and remove bitfields use
 7413394 Fix the memory leak in ApplyFilters.
 2053c2c simplify the alpha-filter testing loop
 825b64d swig/python: add minimal documentation
 14677e1 swig: add python encode support
 a5c297c swig/java: reduce wrapper function code duplication
 ad4a367 swig/java: rework uint8_t typemap
 0d25876 use uint8_t for inv_palette[]
 afa3450 Fix the bug in ApplyPalette.
 2d6ac42 Merge "webp/lossless: fix big endian BGRA output"
 2ca8396 webp/lossless: fix big endian BGRA output
 742110c Speed up ApplyPalette for ARGB pixels.
 2451e47 misc code cleanup
 83db404 Merge "swig: add python (decode) support"
 eeeea8b Merge "swig: cosmetics"
 d5f9b8f Merge "libwebp: fix vp8 encoder mem alloc offsetting"
 d8edd83 libwebp: fix vp8 encoder mem alloc offsetting
 8983b83 remove use of bit-fields in VP8FInfo
 87a4fca remove some warnings:
 ba8f74e Merge "fix for big-endian"
 a65067f Merge "Further reduce memory to decode lossy+alpha images"
 64c8448 Further reduce memory to decode lossy+alpha images
 332130b Mux: make a few methods static
 4437061 fix for big-endian
 5199eab Merge "add uncompressed TIFF output support"
 a3aede9 add uncompressed TIFF output support
 f975b67 Merge "gif2webp: Fix signed/unsigned comparison mismatch"
 5fbc734 Merge "GetFeatures: Detect invalid VP8X/VP8/VP8L data"
 d5060c8 Merge "mux.h: A comment fix + some consistency fixes"
 352d0de GetFeatures: Detect invalid VP8X/VP8/VP8L data
 3ef79fe Cosmetic: "width * height"
 043e1ae gif2webp: Fix signed/unsigned comparison mismatch
 5818cff mux.h: A comment fix + some consistency fixes
 1153f88 Merge "swig: ifdef some Java specific code"
 3eeedae Makefile.vc: fix libwebpdemux dll variable typo
 f980faf swig: add python (decode) support
 7f5f42b swig: cosmetics
 8eae188 WebP-Lossless encoding improvements.
 c7247c4 swig: ifdef some Java specific code
 4cb234d Merge "Mux: make ValidateForSingleImage() method static"
 ed6f530 Merge "Add GetCanvasSize() method to mux"
 1d530c9 Mux: make ValidateForSingleImage() method static
 bba4c2b configure: add warning related flags
 fffefd1 Add GetCanvasSize() method to mux
 732da8d Merge "configure: add GLUT detection; build vwebp"
 0e513f7 configure: add GLUT detection; build vwebp
 55d1c15 Merge "Alpha decoding: significantly reduce memory usage"
 13d99fb Merge "configure: add --enable-everything"
 2bf698f Merge "configure.ac: add some helper macros"
 edccd19 Alpha decoding: significantly reduce memory usage
 3cafcc9 configure: add --enable-everything
 4ef1447 configure.ac: add some helper macros
 a4e1cdb Remove the gcc compilation comments
 6393fe4 Cosmetic fixes
 9c4ce97 Simplify forward-WHT + SSE2 version
 878b9da fix missed optim
 0004617 VP8GetInfo(): Check for zero width or height.
 9bf3129 align VP8Encoder::nz_ allocation
 5da165c fix CheckMode() signature
 0ece07d Merge "explicitly pad bitfields to 32-bits"
 9dbc9d1 explicitly pad bitfields to 32-bits
 5369a80 Merge "prevent signed int overflow in left shift ops"
 70e3971 Merge "cosmetics: remove unnecessary ';'s"
 d3136ce Merge "don't forward declare enums"
 b26e5ad gif2webp: Fix ICC and XMP support
 46089b2 Add missing name to AUTHORS
 94328d6 Demux: Fix a potential memleak
 96e948d don't forward declare enums
 f4f9088 prevent signed int overflow in left shift ops
 0261545 cosmetics: remove unnecessary ';'s
 7ebdf11 Merge "Fix few missing comparisons to NULL"
 1579989 Fix few missing comparisons to NULL
 ea1b21c Cleaned up VP8GetHeaders() so that it parses only frame header
 b66caee dwebp: add support for BMP output
 ff885bf add precision about dynamic output reallocation with IDecoder
 79241d5 Merge "Makefile.vc: have 'all' target build everything"
 ac1c729 Merge "Makefile.vc: flags cleanup"
 118a055 Merge "Makefile.vc: drop /FD flag"
 ecad010 Merge "update gitignore"
 a681b4f Rename PRE_VP8 state to WEBP_HEADER
 ead4d47 Add incremental support for extended format files
 69d0f92 Makefile.vc: have 'all' target build everything
 5296749 Makefile.vc: flags cleanup
 c61baf0 Makefile.vc: drop /FD flag
 3a15125 update gitignore
 5167ca4 Merge "WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded."
 67708d6 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded.
 b68912a pngdec: Avoid a double-free.
 82abbe1 Merge "configure.ac: add AM_PROG_AR for automake >= 1.12"
 e7d9548 add WebPBlendAlpha() function to blend colors against background
 ed4dc71 configure.ac: add AM_PROG_AR for automake >= 1.12
 df4a406 Merge branch '0.3.0'
 1e0d4b8 Update ChangeLog (tag: v0.3.0-rc7, tag: v0.3.0)
 d52b405 Cosmetic fixes
 6cb4a61 misc style fix
 68111ab add missing YUVA->ARGB automatic conversion in WebPEncode()
 e9a7990 Cosmetic fixes
 403bfe8 Container spec: Clarify frame disposal
 2aaa423 Merge "add missing YUVA->ARGB automatic conversion in WebPEncode()"
 07d87bd add missing YUVA->ARGB automatic conversion in WebPEncode()
 142c462 misc style fix
 3e7a13a Merge "Container spec: clarify the background color field" into 0.3.0
 14af774 container doc: add a note about the 'ANMF' payload
 cc635ef Container spec: clarify the background color field
@ -18,7 +950,7 @@ a5ebd14 gif2webp: Bgcolor fix for a special case
 3c8eb9a fix bad saturation order in QuantizeBlock
 04c7a2e vwebp/animation: fix background dispose
 81a5069 Makefile.vc: fix dynamic builds
-5f25c39 update ChangeLog
+5f25c39 update ChangeLog (tag: v0.3.0-rc6)
 14d42af examples: don't use C99 %zu
 5ccf1fe update ChangeLog
 2560c24 update NEWS
@ -324,7 +1256,7 @@ a61a824 Merge "Add NULL check in chunk APIs"
 a077072 mux struct naming
 6c66dde Merge "Tune Lossless encoder"
 ab5ea21 Tune Lossless encoder
-74fefc8 Update ChangeLog (v0.2.1, origin/0.2.0)
+74fefc8 Update ChangeLog (tag: v0.2.1, origin/0.2.0, 0.2.0)
 92f8059 Rename some chunks:
 3bb4bbe Merge "Mux API change:"
 d0c79f0 Mux API change:
@ -394,7 +1326,7 @@ c7eb457 make VP8DspInitNEON() public
 ab3234a Create WebPMuxFrameInfo struct for Mux APIs
 e3990fd Alignment fixes
 e55fbd6 Merge branch '0.2.0'
-4238bc0 Update ChangeLog (v0.2.0)
+4238bc0 Update ChangeLog (tag: v0.2.0)
 c655380 dec/io.c: cosmetics
 fe1958f RGBA4444: harmonize lossless/lossy alpha values
 681cb30 fix RGBA4444 output w/fancy upsampling
@ -405,7 +1337,7 @@ f56e98f Alignment fix
 a0a4885 Lossless decoder fix for a special transform order
 62dd9bb Update encoding heuristic w.r.t palette colors.
 6f4272b remove unused ApplyInverseTransform()
-93bf0fa Update ChangeLog (v0.2.0-rc1)
+93bf0fa Update ChangeLog (tag: v0.2.0-rc1)
 5934fc5 update AUTHORS
 014a711 update NEWS
 43b0d61 add support for ARGB -> YUVA conversion for lossless decoder
@ -448,7 +1380,7 @@ cbee59e Merge commit 'v0.1.99'
 3bc3f7c Merge "dwebp: add PAM output support" into 0.2.0
 d919ed0 dwebp: add PAM output support
 85e215d README/manpages/configure: update website link
-c3a207b Update ChangeLog (v0.1.99)
+c3a207b Update ChangeLog (tag: v0.1.99)
 d1fd782 Merge "add extra precision about default values and behaviour" into 0.2.0
 efc826e add extra precision about default values and behaviour
 9f29635 header/doc clean up
@ -1073,7 +2005,7 @@ f3bf4c7 Added Mux Container Spec & README for MUX-API.
 9f761cf Changed function signature for WebPMuxCreate
 5f31b5e Merge "Add Mux library for manipulating WebP container."
 2315785 Add Mux library for manipulating WebP container.
-7e198ab update ChangeLog (v0.1.3)
+7e198ab update ChangeLog (tag: v0.1.3)
 dfc9c1e Harmonize the dates
 28ad70c Fix PNG decoding bug
 846e93c Update AUTHORS & add .mailmap
@ -1214,7 +2146,7 @@ cfbf88a add SSE2 functions. ~2x faster encoding on average.
 e7ff3f9 merge two ITransforms together when applicable and change the TTransform to return the sum directly.
 ca55413 fix WebPIDecGetRGB() to accept any RGB(A) mode, not just MODE_RGB
 8aa50ef fix some 'man' typos
-d3f3bdd update ChangeLog (v0.1.2)
+d3f3bdd update ChangeLog (tag: v0.1.2)
 d7e9a69 update contributor list
 261abb8 add a 'superclean' section
 276ae82 Remove files not mean to be in git, and update .gitignore
--- a/Makefile.vc
+++ b/Makefile.vc
@ -24,18 +24,18 @@ PLATFORM_LDFLAGS = /SAFESEH
 #############################################################
 ## Nothing more to do below this line!
-MT         = mt.exe
+NOLOGO     = /nologo
-CCNODBG    = cl.exe /nologo /O2 /DNDEBUG
+CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
-CCDEBUG    = cl.exe /nologo /Od /Gm /Zi /D_DEBUG /RTC1
+CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
-CFLAGS     = /Isrc /nologo /W3 /EHsc /FD /c /GS
+CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
 CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
 LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
-LNKDLL     = link.exe /DLL
+LNKDLL     = link.exe /DLL $(NOLOGO)
-LNKLIB     = link.exe /lib
+LNKEXE     = link.exe $(NOLOGO)
-LNKEXE     = link.exe
+LNKLIB     = lib.exe $(NOLOGO)
-LFLAGS     = /nologo /machine:$(ARCH)
+MT         = mt.exe $(NOLOGO)
 CFGSET     = FALSE
 !IF "$(OBJDIR)" == ""
@ -44,11 +44,21 @@ OUTDIR = ..\obj\
 OUTDIR = $(OBJDIR)
 !ENDIF
 !IF "$(HAVE_AVX2)" == "1"
 CFLAGS = $(CFLAGS) /DWEBP_HAVE_AVX2
 AVX2_FLAGS = /arch:AVX2
 !ENDIF
 ##############################################################
 # Runtime library configuration
 !IF "$(RTLIBCFG)" == "static"
 RTLIB  = /MT
 RTLIBD = /MTd
 !ELSE IF "$(RTLIBCFG)" == "legacy"
 RTLIBCFG = static
 RTLIB  = /MT
 RTLIBD = /MTd
 CFLAGS = $(CFLAGS) /GS- /arch:IA32
 !ELSE
 RTLIB   = /MD
 RTLIBD  = /MDd
@ -108,7 +118,7 @@ CC     = $(CC) /I$(DIROBJ) /FI$(DLLINC) $(RTLIB) /DWEBP_DLL
 LIBWEBPDECODER = $(DIRLIB)\$(LIBWEBPDECODER_BASENAME)_dll.lib
 LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME)_dll.lib
 LIBWEBPMUX = $(DIRLIB)\$(LIBWEBPMUX_BASENAME)_dll.lib
-LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPMDEMUX_BASENAME)_dll.lib
+LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPDEMUX_BASENAME)_dll.lib
 LIBWEBP_PDBNAME = $(DIROBJ)\$(LIBWEBP_BASENAME)_dll.pdb
 CFGSET = TRUE
 !ENDIF
@ -130,9 +140,11 @@ CFGSET = TRUE
 !MESSAGE -  clean                         - perform a clean for CFG
 !MESSAGE -  experimental                  - build CFG with experimental
 !MESSAGE .                                  features enabled.
-!MESSAGE - (empty) or all                 - build all targets for CFG
+!MESSAGE - (empty)                        - build libwebp-based targets for CFG
 !MESSAGE - all                            - build (de)mux-based targets for CFG
 !MESSAGE
 !MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
 !MESSAGE   'legacy' will produce a Windows 2000 compatible library.
 !MESSAGE OBJDIR is the path where you like to build (obj, bins, etc.),
 !MESSAGE   defaults to ..\obj
@ -155,7 +167,6 @@ DEC_OBJS = \
    $(DIROBJ)\dec\frame.obj \
    $(DIROBJ)\dec\idec.obj \
    $(DIROBJ)\dec\io.obj \
    $(DIROBJ)\dec\layer.obj \
    $(DIROBJ)\dec\quant.obj \
    $(DIROBJ)\dec\tree.obj \
    $(DIROBJ)\dec\vp8.obj \
@ -166,18 +177,29 @@ DEMUX_OBJS = \
    $(DIROBJ)\demux\demux.obj \
 DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\alpha_processing.obj \
    $(DIROBJ)\dsp\alpha_processing_sse2.obj \
    $(DIROBJ)\dsp\cpu.obj \
    $(DIROBJ)\dsp\dec.obj \
    $(DIROBJ)\dsp\dec_clip_tables.obj \
    $(DIROBJ)\dsp\dec_mips32.obj \
    $(DIROBJ)\dsp\dec_neon.obj \
    $(DIROBJ)\dsp\dec_sse2.obj \
    $(DIROBJ)\dsp\lossless.obj \
    $(DIROBJ)\dsp\lossless_mips32.obj \
    $(DIROBJ)\dsp\lossless_neon.obj \
    $(DIROBJ)\dsp\lossless_sse2.obj \
    $(DIROBJ)\dsp\upsampling.obj \
    $(DIROBJ)\dsp\upsampling_neon.obj \
    $(DIROBJ)\dsp\upsampling_sse2.obj \
    $(DIROBJ)\dsp\yuv.obj \
    $(DIROBJ)\dsp\yuv_mips32.obj \
    $(DIROBJ)\dsp\yuv_sse2.obj \
 DSP_ENC_OBJS = \
    $(DIROBJ)\dsp\enc.obj \
    $(DIROBJ)\dsp\enc_avx2.obj \
    $(DIROBJ)\dsp\enc_mips32.obj \
    $(DIROBJ)\dsp\enc_neon.obj \
    $(DIROBJ)\dsp\enc_sse2.obj \
@ -186,6 +208,7 @@ EX_FORMAT_DEC_OBJS = \
    $(DIROBJ)\examples\metadata.obj \
    $(DIROBJ)\examples\pngdec.obj \
    $(DIROBJ)\examples\tiffdec.obj \
    $(DIROBJ)\examples\webpdec.obj \
    $(DIROBJ)\examples\wicdec.obj \
 EX_UTIL_OBJS = \
@ -201,8 +224,11 @@ ENC_OBJS = \
    $(DIROBJ)\enc\frame.obj \
    $(DIROBJ)\enc\histogram.obj \
    $(DIROBJ)\enc\iterator.obj \
    $(DIROBJ)\enc\layer.obj \
    $(DIROBJ)\enc\picture.obj \
    $(DIROBJ)\enc\picture_csp.obj \
    $(DIROBJ)\enc\picture_psnr.obj \
    $(DIROBJ)\enc\picture_rescale.obj \
    $(DIROBJ)\enc\picture_tools.obj \
    $(DIROBJ)\enc\quant.obj \
    $(DIROBJ)\enc\syntax.obj \
    $(DIROBJ)\enc\token.obj \
@ -222,6 +248,7 @@ UTILS_DEC_OBJS = \
    $(DIROBJ)\utils\huffman.obj \
    $(DIROBJ)\utils\quant_levels_dec.obj \
    $(DIROBJ)\utils\rescaler.obj \
    $(DIROBJ)\utils\random.obj \
    $(DIROBJ)\utils\thread.obj \
    $(DIROBJ)\utils\utils.obj \
@ -238,8 +265,10 @@ LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS) $(LIBWEBPDEMUX_OBJS)
 OUT_LIBS = $(LIBWEBPDECODER) $(LIBWEBP)
 OUT_EXAMPLES = $(DIRBIN)\cwebp.exe $(DIRBIN)\dwebp.exe
 EXTRA_EXAMPLES = $(DIRBIN)\vwebp.exe $(DIRBIN)\webpmux.exe
-all: $(OUT_LIBS) $(OUT_EXAMPLES)
+ex: $(OUT_LIBS) $(OUT_EXAMPLES)
 all: ex $(EXTRA_EXAMPLES)
 $(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj $(EX_FORMAT_DEC_OBJS)
 $(DIRBIN)\dwebp.exe: $(DIROBJ)\examples\dwebp.obj
 $(DIRBIN)\vwebp.exe: $(DIROBJ)\examples\vwebp.obj
@ -247,6 +276,7 @@ $(DIRBIN)\vwebp.exe: $(EX_UTIL_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\webpmux.exe: $(DIROBJ)\examples\webpmux.obj $(LIBWEBPMUX)
 $(DIRBIN)\webpmux.exe: $(EX_UTIL_OBJS) $(LIBWEBP)
 $(OUT_EXAMPLES): $(EX_UTIL_OBJS) $(LIBWEBP)
 $(EX_UTIL_OBJS) $(EX_FORMAT_DEC_OBJS): $(OUTPUT_DIRS)
 experimental:
 	$(MAKE) /f Makefile.vc \
@ -278,7 +308,7 @@ clean::
 	@-erase /s $(DIROBJ)\$(DLLC) $(DIROBJ)\$(DLLINC) 2> NUL
 !ELSE
 $(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
-	$(LNKLIB) /out:$@ $(LFLAGS) $**
+	$(LNKLIB) /out:$@ $**
 	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y
 !ENDIF
@ -303,6 +333,12 @@ $(DIROBJ)\$(DLLC): $(DIROBJ)\$(DLLINC)
 	@echo } >> $@
 .SUFFIXES: .c .obj .res .exe
 # File-specific flag builds. Note batch rules take precedence over wildcards,
 # so for now name each file individually.
 $(DIROBJ)\dsp\enc_avx2.obj: src\dsp\enc_avx2.c
 	$(CC) $(CFLAGS) $(AVX2_FLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dsp\ \
 	  src\dsp\$(@B).c
 # Batch rules
 {examples}.c{$(DIROBJ)\examples}.obj::
 	$(CC) $(CFLAGS) /Fd$(DIROBJ)\examples\ /Fo$(DIROBJ)\examples\ $<
 {src\dec}.c{$(DIROBJ)\dec}.obj::
--- a/47
+++ b/47
@ -1,3 +1,50 @@
 - 3/3/15: version 0.4.3
  This is a binary compatible release.
  * Android / gcc / iOS / MSVS build fixes and improvements
  * lossless decode fix (issue #239 -- since 0.4.0)
  * documentation / vwebp updates for animation
  * multi-threading fix (issue #234)
 - 10/13/14: version 0.4.2
  This is a binary compatible release.
  * Android / gcc build fixes
  * (Windows) fix reading from stdin and writing to stdout
  * gif2webp: miscellaneous fixes
  * fix 'alpha-leak' with lossy compression (issue #220)
  * the lossless bitstream spec has been amended to reflect the current code
 - 7/24/14: version 0.4.1
  This is a binary compatible release.
  * AArch64 (arm64) & MIPS support/optimizations
  * NEON assembly additions:
    - ~25% faster lossy decode / encode (-m 4)
    - ~10% faster lossless decode
    - ~5-10% faster lossless encode (-m 3/4)
  * dwebp/vwebp can read from stdin
  * cwebp/gif2webp can write to stdout
  * cwebp can read webp files; useful if storing sources as webp lossless
 - 12/19/13: version 0.4.0
  * improved gif2webp tool
  * numerous fixes, compression improvement and speed-up
  * dither option added to decoder (dwebp -dither 50 ...)
  * improved multi-threaded modes (-mt option)
  * improved filtering strength determination
  * New function: WebPMuxGetCanvasSize
  * BMP and TIFF format output added to 'dwebp'
  * Significant memory reduction for decoding lossy images with alpha.
  * Intertwined decoding of RGB and alpha for a shorter
    time-to-first-decoded-pixel.
  * WebPIterator has a new member 'has_alpha' denoting whether the frame
    contains transparency.
  * Container spec amended with new 'blending method' for animation.
 - 6/13/13: version 0.3.1
  This is a binary compatible release.
  * Add incremental decoding support for images containing ALPH and ICCP chunks.
  * Python bindings via swig for the simple encode/decode interfaces similar to
    Java.
 - 3/20/13: version 0.3.0
  This is a binary compatible release.
  * WebPINewRGB/WebPINewYUVA accept being passed a NULL output buffer
--- a/39
+++ b/39
@ -1,22 +1,23 @@
 Additional IP Rights Grant (Patents)
 ------------------------------------
-"This implementation" means the copyrightable works distributed by
+"These implementations" means the copyrightable works that implement the WebM
-Google as part of the WebM Project.
+codecs distributed by Google as part of the WebM Project.
-Google hereby grants to you a perpetual, worldwide, non-exclusive,
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
-no-charge, royalty-free, irrevocable (except as stated in this section)
+royalty-free, irrevocable (except as stated in this section) patent license to
-patent license to make, have made, use, offer to sell, sell, import,
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
-transfer, and otherwise run, modify and propagate the contents of this
+run, modify and propagate the contents of these implementations of WebM, where
-implementation of VP8, where such license applies only to those patent
+such license applies only to those patent claims, both currently owned by
-claims, both currently owned by Google and acquired in the future,
+Google and acquired in the future, licensable by Google that are necessarily
-licensable by Google that are necessarily infringed by this
+infringed by these implementations of WebM. This grant does not include claims
-implementation of VP8. This grant does not include claims that would be
+that would be infringed only as a consequence of further modification of these
-infringed only as a consequence of further modification of this
+implementations. If you or your agent or exclusive licensee institute or order
-implementation. If you or your agent or exclusive licensee institute or
+or agree to the institution of patent litigation or any other patent
-order or agree to the institution of patent litigation against any
+enforcement activity against any entity (including a cross-claim or
-entity (including a cross-claim or counterclaim in a lawsuit) alleging
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
-that this implementation of VP8 or any code incorporated within this
+or any code incorporated within any of these implementations of WebM
-implementation of VP8 constitutes direct or contributory patent
+constitutes direct or contributory patent infringement, or inducement of
-infringement, or inducement of patent infringement, then any patent
+patent infringement, then any patent rights granted to you under this License
-rights granted to you under this License for this implementation of VP8
+for these implementations of WebM shall terminate as of the date such
-shall terminate as of the date such litigation is filed.
+litigation is filed.
--- a/167
+++ b/167
@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.3.0
+               \____/____/\_____/_____/____/v0.4.3
 Description:
 ============
@ -80,8 +80,8 @@ more options.
 SWIG bindings:
 --------------
-To generate language bindings from swig/libwebp.i swig-1.3
+To generate language bindings from swig/libwebp.swig at least swig-1.3
-(http://www.swig.org) is required. 2.0 may work, but has not been tested.
+(http://www.swig.org) is required.
 Currently the following functions are mapped:
 Decode:
@ -104,12 +104,20 @@ Encode:
  WebPEncodeLosslessRGB
  WebPEncodeLosslessBGR
 See swig/README for more detailed build instructions.
 Java bindings:
 To build the swig-generated JNI wrapper code at least JDK-1.5 (or equivalent)
 is necessary for enum support. The output is intended to be a shared object /
 DLL that can be loaded via System.loadLibrary("webp_jni").
 Python bindings:
 To build the swig-generated Python extension code at least Python 2.6 is
 required. Python < 2.6 may build with some minor changes to libwebp.swig or the
 generated code, but is untested.
 Encoding tool:
 ==============
@ -132,28 +140,30 @@ A longer list of options is available using the -longhelp command line flag:
 Usage:
 cwebp [-preset <...>] [options] in_file [-o out_file]
-If input size (-s) for an image is not specified, it is assumed to be a PNG,
+If input size (-s) for an image is not specified, it is
-JPEG or TIFF file.
+assumed to be a PNG, JPEG, TIFF or WebP file.
-options:
+
 Options:
  -h / -help  ............ short help
  -H / -longhelp  ........ long help
  -q <float> ............. quality factor (0:small..100:big)
-  -alpha_q <int> ......... Transparency-compression quality (0..100).
+  -alpha_q <int> ......... transparency-compression quality (0..100)
-  -preset <string> ....... Preset setting, one of:
+  -preset <string> ....... preset setting, one of:
                            default, photo, picture,
                            drawing, icon, text
-     -preset must come first, as it overwrites other parameters.
+     -preset must come first, as it overwrites other parameters
  -m <int> ............... compression method (0=fast, 6=slowest)
  -segments <int> ........ number of segments to use (1..4)
-  -size <int> ............ Target size (in bytes)
+  -size <int> ............ target size (in bytes)
-  -psnr <float> .......... Target PSNR (in dB. typically: 42)
+  -psnr <float> .......... target PSNR (in dB. typically: 42)
-  -s <int> <int> ......... Input size (width x height) for YUV
+  -s <int> <int> ......... input size (width x height) for YUV
-  -sns <int> ............. Spatial Noise Shaping (0:off, 100:max)
+  -sns <int> ............. spatial noise shaping (0:off, 100:max)
  -f <int> ............... filter strength (0=off..100)
  -sharpness <int> ....... filter sharpness (0:most .. 7:least sharp)
-  -strong ................ use strong filter instead of simple (default).
+  -strong ................ use strong filter instead of simple (default)
-  -nostrong .............. use simple filter instead of strong.
+  -nostrong .............. use simple filter instead of strong
  -partition_limit <int> . limit quality to fit the 512k limit on
                           the first partition (0=no degradation ... 100=full)
  -pass <int> ............ analysis pass number (1..10)
@ -161,37 +171,40 @@ options:
  -resize <w> <h> ........ resize picture (after any cropping)
  -mt .................... use multi-threading if available
  -low_memory ............ reduce memory usage (slower encoding)
-  -map <int> ............. print map of extra info.
+  -map <int> ............. print map of extra info
-  -print_psnr ............ prints averaged PSNR distortion.
+  -print_psnr ............ prints averaged PSNR distortion
-  -print_ssim ............ prints averaged SSIM distortion.
+  -print_ssim ............ prints averaged SSIM distortion
-  -print_lsim ............ prints local-similarity distortion.
+  -print_lsim ............ prints local-similarity distortion
-  -d <file.pgm> .......... dump the compressed output (PGM file).
+  -d <file.pgm> .......... dump the compressed output (PGM file)
-  -alpha_method <int> .... Transparency-compression method (0..1)
+  -alpha_method <int> .... transparency-compression method (0..1)
-  -alpha_filter <string> . predictive filtering for alpha plane.
+  -alpha_filter <string> . predictive filtering for alpha plane,
-                           One of: none, fast (default) or best.
+                           one of: none, fast (default) or best
-  -alpha_cleanup ......... Clean RGB values in transparent area.
+  -alpha_cleanup ......... clean RGB values in transparent area
-  -noalpha ............... discard any transparency information.
+  -blend_alpha <hex> ..... blend colors against background color
-  -lossless .............. Encode image losslessly.
+                           expressed as RGB values written in
-  -hint <string> ......... Specify image characteristics hint.
+                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0
-                           One of: photo, picture or graph
+                           green=0xe0 and blue=0xd0
  -noalpha ............... discard any transparency information
  -lossless .............. encode image losslessly
  -hint <string> ......... specify image characteristics hint,
                           one of: photo, picture or graph
  -metadata <string> ..... comma separated list of metadata to
                           copy from the input to the output if present.
                           Valid values: all, none (default), exif, icc, xmp
  -short ................. condense printed message
-  -quiet ................. don't print anything.
+  -quiet ................. don't print anything
-  -version ............... print version number and exit.
+  -version ............... print version number and exit
-  -noasm ................. disable all assembly optimizations.
+  -noasm ................. disable all assembly optimizations
  -v ..................... verbose, e.g. print encoding/decoding times
  -progress .............. report encoding progress
 Experimental Options:
-  -jpeg_like ............. Roughly match expected JPEG size.
+  -jpeg_like ............. roughly match expected JPEG size
-  -af .................... auto-adjust filter strength.
+  -af .................... auto-adjust filter strength
  -pre <int> ............. pre-processing filter
 The main options you might want to try in order to further tune the
 visual quality are:
 -preset
@ -243,21 +256,26 @@ Decodes the WebP image file to PNG format [Default]
 Use the following options to convert into alternate image formats:
  -pam ......... save the raw RGBA samples as a color PAM
  -ppm ......... save the raw RGB samples as a color PPM
  -bmp ......... save as uncompressed BMP format
  -tiff ........ save as uncompressed TIFF format
  -pgm ......... save the raw YUV samples as a grayscale PGM
-                 file with IMC4 layout.
+                 file with IMC4 layout
-  -yuv ......... save the raw YUV samples in flat layout.
+  -yuv ......... save the raw YUV samples in flat layout
 Other options are:
-  -version  .... print version number and exit.
+  -version  .... print version number and exit
-  -nofancy ..... don't use the fancy YUV420 upscaler.
+  -nofancy ..... don't use the fancy YUV420 upscaler
-  -nofilter .... disable in-loop filtering.
+  -nofilter .... disable in-loop filtering
  -nodither .... disable dithering
  -dither <d> .. dithering strength (in 0..100)
  -mt .......... use multi-threading
  -crop <x> <y> <w> <h> ... crop output with the given rectangle
  -scale <w> <h> .......... scale the output (*after* any cropping)
-  -alpha ....... only save the alpha plane.
+  -alpha ....... only save the alpha plane
-  -h     ....... this help message.
+  -incremental . use incremental decoding (useful for tests)
  -h     ....... this help message
  -v     ....... verbose (e.g. print encoding/decoding times)
-  -noasm ....... disable all assembly optimizations.
+  -noasm ....... disable all assembly optimizations
 Visualization tool:
 ===================
@ -271,18 +289,19 @@ Usage: vwebp in_file [options]
 Decodes the WebP image file and visualizes it using OpenGL
 Options are:
-  -version  .... print version number and exit.
+  -version  .... print version number and exit
-  -noicc ....... don't use the icc profile if present.
+  -noicc ....... don't use the icc profile if present
-  -nofancy ..... don't use the fancy YUV420 upscaler.
+  -nofancy ..... don't use the fancy YUV420 upscaler
-  -nofilter .... disable in-loop filtering.
+  -nofilter .... disable in-loop filtering
-  -mt .......... use multi-threading.
+  -dither <int>  dithering strength (0..100), default=50
-  -info ........ print info.
+  -mt .......... use multi-threading
-  -h     ....... this help message.
+  -info ........ print info
  -h     ....... this help message
 Keyboard shortcuts:
-  'c' ................ toggle use of color profile.
+  'c' ................ toggle use of color profile
-  'i' ................ overlay file information.
+  'i' ................ overlay file information
-  'q' / 'Q' / ESC .... quit.
+  'q' / 'Q' / ESC .... quit
 Building:
 ---------
@ -310,6 +329,43 @@ $ make -f makefile.unix examples/vwebp
 > nmake /f Makefile.vc CFG=release-static \
    ../obj/x64/release-static/bin/vwebp.exe
 Animated GIF conversion:
 ========================
 Animated GIF files can be converted to WebP files with animation using the
 gif2webp utility available under examples/. The files can then be viewed using
 vwebp.
 Usage:
 gif2webp [options] gif_file -o webp_file
 Options:
  -h / -help  ............ this help
  -lossy ................. encode image using lossy compression
  -mixed ................. for each frame in the image, pick lossy
                           or lossless compression heuristically
  -q <float> ............. quality factor (0:small..100:big)
  -m <int> ............... compression method (0=fast, 6=slowest)
  -kmin <int> ............ min distance between key frames
  -kmax <int> ............ max distance between key frames
  -f <int> ............... filter strength (0=off..100)
  -metadata <string> ..... comma separated list of metadata to
                           copy from the input to the output if present
                           Valid values: all, none, icc, xmp (default)
  -mt .................... use multi-threading if available
  -version ............... print version number and exit
  -v ..................... verbose
  -quiet ................. don't print anything
 Building:
 ---------
 With the libgif development files installed, gif2webp can be built using
 makefile.unix:
 $ make -f makefile.unix examples/gif2webp
 or using autoconf:
 $ ./configure --enable-everything
 $ make
 Encoding API:
 =============
@ -387,15 +443,20 @@ The encoding flow looks like:
  // Set up a byte-output write method. WebPMemoryWriter, for instance.
  WebPMemoryWriter wrt;
  WebPMemoryWriterInit(&wrt);     // initialize 'wrt'
  pic.writer = MyFileWriter;
  pic.custom_ptr = my_opaque_structure_to_make_MyFileWriter_work;
  // initialize 'wrt' here...
  // Compress!
  int ok = WebPEncode(&config, &pic);   // ok = 0 => error occurred!
  WebPPictureFree(&pic);  // must be called independently of the 'ok' result.
  // output data should have been handled by the writer at that point.
  // -> compressed data is the memory buffer described by wrt.mem / wrt.size
  // deallocate the memory used by compressed data
  WebPMemoryWriterClear(&wrt);
 -------------------------------------- END PSEUDO EXAMPLE
--- a/README.mux
+++ b/README.mux
@ -1,7 +1,7 @@
          __   __  ____  ____  ____  __ __  _     __ __
         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
         \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.1.0
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.2.2
 Description:
@ -33,34 +33,35 @@ Usage: webpmux -get GET_OPTIONS INPUT -o OUTPUT
       webpmux -version
 GET_OPTIONS:
- Extract relevant data.
+ Extract relevant data:
-   icc       Get ICC profile.
+   icc       get ICC profile
-   exif      Get EXIF metadata.
+   exif      get EXIF metadata
-   xmp       Get XMP metadata.
+   xmp       get XMP metadata
-   frame n   Get nth frame.
+   frame n   get nth frame
 SET_OPTIONS:
- Set color profile/metadata.
+ Set color profile/metadata:
-   icc  file.icc     Set ICC profile.
+   icc  file.icc     set ICC profile
-   exif file.exif    Set EXIF metadata.
+   exif file.exif    set EXIF metadata
-   xmp  file.xmp     Set XMP metadata.
+   xmp  file.xmp     set XMP metadata
   where:    'file.icc' contains the ICC profile to be set,
             'file.exif' contains the EXIF metadata to be set
             'file.xmp' contains the XMP metadata to be set
 STRIP_OPTIONS:
- Strip color profile/metadata.
+ Strip color profile/metadata:
-   icc       Strip ICC profile.
+   icc       strip ICC profile
-   exif      Strip EXIF metadata.
+   exif      strip EXIF metadata
-   xmp       Strip XMP metadata.
+   xmp       strip XMP metadata
 FRAME_OPTIONS(i):
- Create animation.
+ Create animation:
-   file_i +di+xi+yi+mi
+   file_i +di+[xi+yi[+mi[bi]]]
   where:    'file_i' is the i'th animation frame (WebP format),
-             'di' is the pause duration before next frame.
+             'di' is the pause duration before next frame,
-             'xi','yi' specify the image offset for this frame.
+             'xi','yi' specify the image offset for this frame,
-             'mi' is the dispose method for this frame (0 or 1).
+             'mi' is the dispose method for this frame (0 or 1),
             'bi' is the blending method for this frame (+b or -b)
 LOOP_COUNT:
 Number of times to repeat the animation.
@ -71,7 +72,7 @@ BACKGROUND_COLOR:
  A,R,G,B
  where:    'A', 'R', 'G' and 'B' are integers in the range 0 to 255 specifying
            the Alpha, Red, Green and Blue component values respectively
-            [Default: 255,255,255,255].
+            [Default: 255,255,255,255]
 INPUT & OUTPUT are in WebP format.
--- a/configure.ac
+++ b/configure.ac
@ -1,19 +1,116 @@
-AC_INIT([libwebp], [0.3.0],
+AC_INIT([libwebp], [0.4.3],
        [http://code.google.com/p/webp/issues],,
        [http://developers.google.com/speed/webp])
-AC_CANONICAL_TARGET
+AC_CANONICAL_HOST
 AC_PREREQ([2.60])
 AM_INIT_AUTOMAKE([-Wall foreign subdir-objects])
 dnl === automake >= 1.12 requires this for 'unusual archivers' support.
 dnl === it must occur before LT_INIT (AC_PROG_LIBTOOL).
 m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 AC_PROG_LIBTOOL
 AM_PROG_CC_C_O
 dnl === Enable less verbose output when building.
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 dnl == test endianness
 AC_C_BIGENDIAN
 dnl === SET_IF_UNSET(shell_var, value)
 dnl ===   Set the shell variable 'shell_var' to 'value' if it is unset.
 AC_DEFUN([SET_IF_UNSET], [test "${$1+set}" = "set" || $1=$2])
 AC_ARG_ENABLE([everything],
              AS_HELP_STRING([--enable-everything],
                             [Enable all optional targets. These can still be
                              disabled with --disable-target]),
              [SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
               SET_IF_UNSET([enable_libwebpdemux], [$enableval])
               SET_IF_UNSET([enable_libwebpmux], [$enableval])])
 AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=DIR],
            [Path to the pkgconfig directory @<:@LIBDIR/pkgconfig@:>@]),
            [pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
 AC_SUBST([pkgconfigdir])
 dnl === TEST_AND_ADD_CFLAGS(var, flag)
 dnl ===   Checks whether $CC supports 'flag' and adds it to 'var'
 dnl ===   on success.
 dnl ===   The probe compiles a trivial program with CFLAGS temporarily set to
 dnl ===   "-Werror <flag>"; the caller's CFLAGS are restored afterwards.
 AC_DEFUN([TEST_AND_ADD_CFLAGS],
         [SAVED_CFLAGS="$CFLAGS"
          CFLAGS="-Werror $2"
          AC_MSG_CHECKING([whether $CC supports $2])
          dnl Note AC_LANG_PROGRAM([]) uses an old-style main definition.
          AC_COMPILE_IFELSE([AC_LANG_SOURCE([int main(void) { return 0; }])],
                            [AC_MSG_RESULT([yes])]
                            dnl Simply append the variable avoiding a
                            dnl compatibility ifdef for AS_VAR_APPEND as this
                            dnl variable shouldn't grow all that large.
                            [$1="${$1} $2"],
                            [AC_MSG_RESULT([no])])
          CFLAGS="$SAVED_CFLAGS"])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat-nonliteral])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat-security])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-prototypes])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wold-style-definition])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused-but-set-variable])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wvla])
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62040
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61622
 AS_IF([test "$GCC" = "yes" ], [
       gcc_version=`$CC -dumpversion`
       gcc_wht_bug=""
       case "$host_cpu" in
         aarch64|arm64)
          case "$gcc_version" in
            4.9|4.9.0|4.9.1) gcc_wht_bug=yes ;;
          esac
       esac
       AS_IF([test "$gcc_wht_bug" = "yes"], [
              TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-frename-registers])])])
 AC_SUBST([AM_CFLAGS])
 dnl === Check for machine specific flags
 TEST_AND_ADD_CFLAGS([AVX2_FLAGS], [-mavx2])
 AS_IF([test -n "$AVX2_FLAGS"], [
  SAVED_CFLAGS=$CFLAGS
  CFLAGS="$CFLAGS $AVX2_FLAGS"
  AC_CHECK_HEADER([immintrin.h],
                  [AC_DEFINE(WEBP_HAVE_AVX2, [1],
                   [Set to 1 if AVX2 is supported])],
                  [AVX2_FLAGS=""],
                  dnl it's illegal to directly include avx2intrin.h, but it's
                  dnl included conditionally in immintrin.h, tricky!
                  [#ifndef __AVX2__
                   #error avx2 is not enabled
                   #endif
                  ])
  CFLAGS=$SAVED_CFLAGS])
 AC_SUBST([AVX2_FLAGS])
 TEST_AND_ADD_CFLAGS([SSE2_FLAGS], [-msse2])
 AS_IF([test -n "$SSE2_FLAGS"], [
  SAVED_CFLAGS=$CFLAGS
  CFLAGS="$CFLAGS $SSE2_FLAGS"
  AC_CHECK_HEADER([emmintrin.h],
                  [AC_DEFINE(WEBP_HAVE_SSE2, [1],
                   [Set to 1 if SSE2 is supported])],
                  [SSE2_FLAGS=""])
  CFLAGS=$SAVED_CFLAGS])
 AC_SUBST([SSE2_FLAGS])
 dnl === CLEAR_LIBVARS([var_pfx])
 dnl ===   Clears <var_pfx>_{INCLUDES,LIBS}.
 AC_DEFUN([CLEAR_LIBVARS], [$1_INCLUDES=""; $1_LIBS=""])
 dnl === WITHLIB_OPTION([opt_pfx], [outvar_pfx])
 dnl ===   Defines --with-<opt_pfx>{include,lib}dir options which set
 dnl ===   the variables <outvar_pfx>_{INCLUDES,LIBS}.
@ -27,6 +124,44 @@ AC_DEFUN([WITHLIB_OPTION],
                              [use $2 libraries from DIR]),
               [$2_LIBS="-L$withval"])])
 dnl === LIBCHECK_PROLOGUE([var_pfx])
 dnl ===   Caches the current values of CPPFLAGS/LIBS in SAVED_* then
 dnl ===   prepends the current values with <var_pfx>_{INCLUDES,LIBS}.
 AC_DEFUN([LIBCHECK_PROLOGUE],
         [SAVED_CPPFLAGS=$CPPFLAGS
          SAVED_LIBS=$LIBS
          CPPFLAGS="$$1_INCLUDES $CPPFLAGS"
          LIBS="$$1_LIBS $LIBS"])
 dnl === LIBCHECK_EPILOGUE([var_pfx])
 dnl ===   Restores the values of CPPFLAGS/LIBS from SAVED_* and exports
 dnl ===   <var_pfx>_{INCLUDES,LIBS} with AC_SUBST.
 AC_DEFUN([LIBCHECK_EPILOGUE],
         [AC_SUBST($1_LIBS)
          AC_SUBST($1_INCLUDES)
          CPPFLAGS=$SAVED_CPPFLAGS
          LIBS=$SAVED_LIBS])
 dnl === Check for gcc builtins
 dnl === CHECK_FOR_BUILTIN([builtin], [param], [define])
 dnl ===   links a C program (AC_LANG_PROGRAM) calling <builtin>(<param>),
 dnl ===   AC_DEFINE'ing <define> to 1 if the link succeeds.
 AC_DEFUN([CHECK_FOR_BUILTIN],
         [AC_LANG_PUSH([C])
          AC_MSG_CHECKING([for $1])
          AC_LINK_IFELSE([AC_LANG_PROGRAM([], [$1($2)])],
                         [AC_MSG_RESULT([yes])
                          AC_DEFINE([$3], [1],
                                    [Set to 1 if $1 is available])],
                         [AC_MSG_RESULT([no])])
          AC_LANG_POP])
 dnl AC_CHECK_FUNC doesn't work with builtins.
 CHECK_FOR_BUILTIN([__builtin_bswap16], [1u << 15], [HAVE_BUILTIN_BSWAP16])
 CHECK_FOR_BUILTIN([__builtin_bswap32], [1u << 31], [HAVE_BUILTIN_BSWAP32])
 CHECK_FOR_BUILTIN([__builtin_bswap64], [1ull << 63], [HAVE_BUILTIN_BSWAP64])
 dnl === Check for pthread support
 AC_ARG_ENABLE([threading],
              AS_HELP_STRING([--disable-threading],
@ -40,32 +175,139 @@ if test "$enable_threading" = "yes"; then
              CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
              CC="$PTHREAD_CC"
             ],
-             [enable_threading=no])
+             [AC_CHECK_FUNC([_beginthreadex],
                            [AC_DEFINE([WEBP_USE_THREAD], [1],
                                       [Undefine this to disable thread
                                        support.])],
                            [enable_threading=no])])
 fi
 AC_MSG_NOTICE([checking if threading is enabled... ${enable_threading-no}])
 dnl === check for OpenGL/GLUT support ===
 AC_ARG_ENABLE([gl], AS_HELP_STRING([--disable-gl],
                                   [Disable detection of OpenGL support
                                    @<:@default=auto@:>@]))
 AS_IF([test "x$enable_gl" != "xno"], [
  CLEAR_LIBVARS([GL])
  WITHLIB_OPTION([gl], [GL])
  LIBCHECK_PROLOGUE([GL])
  glut_cflags="none"
  glut_ldflags="none"
  case $host_os in
    darwin*)
      # Special case for OSX builds. Append these to give the user a chance to
      # override with --with-gl*
      glut_cflags="$glut_cflags|-framework GLUT -framework OpenGL"
      glut_ldflags="$glut_ldflags|-framework GLUT -framework OpenGL"
      ;;
  esac
  GLUT_SAVED_CPPFLAGS="$CPPFLAGS"
  SAVED_IFS="$IFS"
  IFS="|"
  for flag in $glut_cflags; do
    # restore IFS immediately as the autoconf macros may need the default.
    IFS="$SAVED_IFS"
    unset ac_cv_header_GL_glut_h
    unset ac_cv_header_OpenGL_glut_h
    case $flag in
      none) ;;
      *) CPPFLAGS="$flag $CPPFLAGS";;
    esac
    AC_CHECK_HEADERS([GL/glut.h GLUT/glut.h OpenGL/glut.h],
                     [glut_headers=yes;
                      test "$flag" = "none" || GL_INCLUDES="$CPPFLAGS";
                      break])
    CPPFLAGS="$GLUT_SAVED_CPPFLAGS"
    test "$glut_headers" = "yes" && break
  done
  IFS="$SAVED_IFS"
  if test "$glut_headers" = "yes"; then
    AC_LANG_PUSH([C])
    GLUT_SAVED_LDFLAGS="$LDFLAGS"
    SAVED_IFS="$IFS"
    IFS="|"
    for flag in $glut_ldflags; do
      # restore IFS immediately as the autoconf macros may need the default.
      IFS="$SAVED_IFS"
      unset ac_cv_search_glBegin
      case $flag in
        none) ;;
        *) LDFLAGS="$flag $LDFLAGS";;
      esac
      # find libGL
      GL_SAVED_LIBS="$LIBS"
      AC_SEARCH_LIBS([glBegin], [GL OpenGL opengl32])
      LIBS="$GL_SAVED_LIBS"
      # A direct link to libGL may not be necessary on e.g., linux.
      GLUT_SAVED_LIBS="$LIBS"
      for lib in "" "-lglut" "-lglut $ac_cv_search_glBegin"; do
        LIBS="$lib"
        AC_LINK_IFELSE(
          [AC_LANG_PROGRAM([
             #ifdef __cplusplus
             # define EXTERN_C extern "C"
             #else
             # define EXTERN_C
             #endif
             EXTERN_C char glOrtho();
             EXTERN_C char glutMainLoop();
            ],[
             glOrtho();
             glutMainLoop();
            ])
          ],
          AC_DEFINE(WEBP_HAVE_GL, [1],
                    [Set to 1 if OpenGL is supported])
          [glut_support=yes], []
        )
        if test "$glut_support" = "yes"; then
          GL_LIBS="$LDFLAGS $lib"
          break
        fi
      done
      LIBS="$GLUT_SAVED_LIBS"
      LDFLAGS="$GLUT_SAVED_LDFLAGS"
      test "$glut_support" = "yes" && break
    done
    IFS="$SAVED_IFS"
    AC_LANG_POP
  fi
  LIBCHECK_EPILOGUE([GL])
  if test "$glut_support" = "yes" -a "$enable_libwebpdemux" = "yes"; then
    build_vwebp=yes
  fi
 ])
 AM_CONDITIONAL([BUILD_VWEBP], [test "$build_vwebp" = "yes"])
 dnl === check for PNG support ===
-PNG_INCLUDES=""
+AC_ARG_ENABLE([png], AS_HELP_STRING([--disable-png],
-PNG_LIBS=""
+                                    [Disable detection of PNG format support
-AC_PATH_PROGS(LIBPNG_CONFIG,
+                                     @<:@default=auto@:>@]))
-              [libpng-config libpng15-config libpng14-config libpng12-config])
+AS_IF([test "x$enable_png" != "xno"], [
  CLEAR_LIBVARS([PNG])
  AC_PATH_PROGS([LIBPNG_CONFIG],
                [libpng-config libpng16-config libpng15-config libpng14-config \
                 libpng12-config])
  if test -n "$LIBPNG_CONFIG"; then
    PNG_INCLUDES=`$LIBPNG_CONFIG --cflags`
-  PNG_PREFIX=`$LIBPNG_CONFIG --prefix`
+    PNG_LIBS="`$LIBPNG_CONFIG --ldflags`"
  if test "${PNG_PREFIX}/lib" != "/usr/lib" ; then
    PNG_LIBS="-L${PNG_PREFIX}/lib"
  fi
  PNG_LIBS="$PNG_LIBS `$LIBPNG_CONFIG --libs`"
  fi
  WITHLIB_OPTION([png], [PNG])
-SAVED_CPPFLAGS=$CPPFLAGS
+  LIBCHECK_PROLOGUE([PNG])
 SAVED_LIBS=$LIBS
 CPPFLAGS="$PNG_INCLUDES $CPPFLAGS"
 LIBS="$PNG_LIBS $LIBS"
  AC_CHECK_HEADER(png.h,
    AC_SEARCH_LIBS(png_get_libpng_ver, [png],
                   [test "$ac_cv_search_png_get_libpng_ver" = "none required" \
@ -85,23 +327,20 @@ AC_CHECK_HEADER(png.h,
     PNG_INCLUDES=""
    ],
  )
-AC_SUBST(PNG_LIBS)
+  LIBCHECK_EPILOGUE([PNG])
-AC_SUBST(PNG_INCLUDES)
+])
 CPPFLAGS=$SAVED_CPPFLAGS
 LIBS=$SAVED_LIBS
 dnl === check for JPEG support ===
-JPEG_INCLUDES=""
+AC_ARG_ENABLE([jpeg],
-JPEG_LIBS=""
+              AS_HELP_STRING([--disable-jpeg],
                             [Disable detection of JPEG format support
                              @<:@default=auto@:>@]))
 AS_IF([test "x$enable_jpeg" != "xno"], [
  CLEAR_LIBVARS([JPEG])
  WITHLIB_OPTION([jpeg], [JPEG])
-SAVED_CPPFLAGS=$CPPFLAGS
+  LIBCHECK_PROLOGUE([JPEG])
 SAVED_LIBS=$LIBS
 CPPFLAGS="$JPEG_INCLUDES $CPPFLAGS"
 LIBS="$JPEG_LIBS $LIBS"
  AC_CHECK_HEADER(jpeglib.h,
    AC_CHECK_LIB(jpeg, jpeg_set_defaults,
                 [JPEG_LIBS="$JPEG_LIBS -ljpeg"
@ -114,23 +353,20 @@ AC_CHECK_HEADER(jpeglib.h,
                 [$MATH_LIBS]),
    AC_MSG_WARN(jpeg library not available - no jpeglib.h)
  )
-AC_SUBST(JPEG_LIBS)
+  LIBCHECK_EPILOGUE([JPEG])
-AC_SUBST(JPEG_INCLUDES)
+])
 CPPFLAGS=$SAVED_CPPFLAGS
 LIBS=$SAVED_LIBS
 dnl === check for TIFF support ===
-TIFF_INCLUDES=""
+AC_ARG_ENABLE([tiff],
-TIFF_LIBS=""
+              AS_HELP_STRING([--disable-tiff],
                             [Disable detection of TIFF format support
                              @<:@default=auto@:>@]))
 AS_IF([test "x$enable_tiff" != "xno"], [
  CLEAR_LIBVARS([TIFF])
  WITHLIB_OPTION([tiff], [TIFF])
-SAVED_CPPFLAGS=$CPPFLAGS
+  LIBCHECK_PROLOGUE([TIFF])
 SAVED_LIBS=$LIBS
 CPPFLAGS="$TIFF_INCLUDES $CPPFLAGS"
 LIBS="$TIFF_LIBS $LIBS"
  AC_CHECK_HEADER(tiffio.h,
    AC_CHECK_LIB(tiff, TIFFGetVersion,
                 [TIFF_LIBS="$TIFF_LIBS -ltiff"
@ -143,47 +379,50 @@ AC_CHECK_HEADER(tiffio.h,
                 [$MATH_LIBS]),
    AC_MSG_WARN(tiff library not available - no tiffio.h)
  )
-AC_SUBST(TIFF_LIBS)
+  LIBCHECK_EPILOGUE([TIFF])
-AC_SUBST(TIFF_INCLUDES)
+])
 CPPFLAGS=$SAVED_CPPFLAGS
 LIBS=$SAVED_LIBS
 dnl === check for GIF support ===
-GIF_INCLUDES=""
+AC_ARG_ENABLE([gif], AS_HELP_STRING([--disable-gif],
-GIF_LIBS=""
+                                    [Disable detection of GIF format support
                                     @<:@default=auto@:>@]))
 AS_IF([test "x$enable_gif" != "xno"], [
  CLEAR_LIBVARS([GIF])
  WITHLIB_OPTION([gif], [GIF])
-SAVED_CPPFLAGS=$CPPFLAGS
+  LIBCHECK_PROLOGUE([GIF])
 SAVED_LIBS=$LIBS
 CPPFLAGS="$GIF_INCLUDES $CPPFLAGS"
 LIBS="$GIF_LIBS $LIBS"
  AC_CHECK_HEADER(gif_lib.h,
    AC_CHECK_LIB([gif], [DGifOpenFileHandle],
                 [GIF_LIBS="$GIF_LIBS -lgif"
                  AC_DEFINE(WEBP_HAVE_GIF, [1],
                            [Set to 1 if GIF library is installed])
                  gif_support=yes
                 ],
                 AC_MSG_WARN(Optional gif library not found),
                 [$MATH_LIBS]),
    AC_MSG_WARN(gif library not available - no gif_lib.h)
  )
-AC_SUBST(GIF_LIBS)
+  LIBCHECK_EPILOGUE([GIF])
 AC_SUBST(GIF_INCLUDES)
 CPPFLAGS=$SAVED_CPPFLAGS
 LIBS=$SAVED_LIBS
  if test "$gif_support" = "yes" -a \
          "$enable_libwebpmux" = "yes"; then
    build_gif2webp=yes
  fi
 ])
 AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])
 dnl === check for WIC support ===
-if test "$target_os" = "mingw32"; then
+AC_ARG_ENABLE([wic],
              AS_HELP_STRING([--disable-wic],
                             [Disable Windows Imaging Component (WIC) detection.
                              @<:@default=auto@:>@]),,
              [enable_wic=yes])
 case $host_os in
 mingw*)
 if test "$enable_wic" = "yes"; then
  AC_CHECK_HEADERS([wincodec.h shlwapi.h windows.h])
  if test "$ac_cv_header_wincodec_h" = "yes"; then
    AC_MSG_CHECKING(for Windows Imaging Component support)
@ -223,6 +462,20 @@ if test "$target_os" = "mingw32"; then
    AC_MSG_RESULT(${wic_support-no})
  fi
 fi
 esac
 dnl === If --enable-aligned is defined, define WEBP_FORCE_ALIGNED
 AC_MSG_CHECKING(if --enable-aligned option is specified)
 AC_ARG_ENABLE([aligned],
              AS_HELP_STRING([--enable-aligned],
                             [Force aligned memory operations in non-dsp code
                              (may be slower)]))
 if test "$enable_aligned" = "yes"; then
  AC_DEFINE(WEBP_FORCE_ALIGNED, [1],
            [Define to 1 to force aligned memory operations])
 fi
 AC_MSG_RESULT(${enable_aligned-no})
 dnl === If --enable-swap-16bit-csp is defined, add -DWEBP_SWAP_16BIT_CSP
@ -277,7 +530,7 @@ AM_CONDITIONAL([BUILD_LIBWEBPDECODER], [test "$enable_libwebpdecoder" = "yes"])
 dnl =========================
 AC_CONFIG_MACRO_DIR([m4])
-AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_HEADERS([src/webp/config.h])
 AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
                 examples/Makefile src/dec/Makefile \
                 src/enc/Makefile src/dsp/Makefile \
@ -295,7 +548,7 @@ WebP Configuration Summary
 Shared libraries: ${enable_shared}
 Static libraries: ${enable_static}
-Threaded decode: ${enable_threading-no}
+Threading support: ${enable_threading-no}
 libwebp: yes
 libwebpdecoder: ${enable_libwebpdecoder-no}
 libwebpdemux: ${enable_libwebpdemux-no}
@ -314,6 +567,8 @@ dwebp: yes
  =====================
  PNG  : ${png_support-no}
  WIC  : ${wic_support-no}
 GIF support : ${gif_support-no}
 gif2webp    : ${build_gif2webp-no}
 webpmux     : ${enable_libwebpmux-no}
 vwebp       : ${build_vwebp-no}
 ])
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -46,25 +46,16 @@ for:
  * **Animation.** An image may have multiple frames with pauses between them,
    making it an animation.
  * **Image Fragmentation.** A single bitstream in WebP has an inherent
    limitation for width or height of 2^14 pixels, and, when using VP8, a 512
    KiB limit on the size of the first compressed partition. To support larger
    images, the format supports images that are composed of multiple fragments,
    each encoded as a separate bitstream. All fragments logically form a single
    image: they have common metadata, color profile, etc. Image fragmentation
    may also improve efficiency for larger images, e.g., grass can be encoded
    differently than sky.
 The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
 "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
 document are to be interpreted as described in [RFC 2119][].
 Bit numbering in chunk diagrams starts at `0` for the most significant bit
 ('MSB 0') as described in [RFC 1166][].
 **Note:** Out of the features mentioned above, lossy compression, lossless
 compression, transparency, metadata, color profile and animation are finalized
-and are to be considered stable. On the other hand, image fragmentation is
+and are to be considered stable.
 experimental as of now, and is open to discussion, feedback and comments.
 The same is indicated using annotation "_status: experimental_" in the relevant
 sections of this document.
 Terminology &amp; Basics
 ------------------------
@ -103,8 +94,10 @@ _1-based_
 : An unsigned integer field storing values offset by `-1`. For example, such a
  field would store the value _25_ as _24_.
-RIFF file format
+
 RIFF File Format
 ----------------
 The WebP file format is based on the RIFF (resource interchange file format)
 document format.
@ -144,7 +137,8 @@ _ChunkHeader('ABCD')_
 chunks that apply to any RIFF file format, while FourCCs specific to a file
 format are all lowercase. WebP does not follow this convention.
-WebP file header
+
 WebP File Header
 ----------------
     0                   1                   2                   3
@ -177,7 +171,8 @@ the 'WEBP' FourCC. The file SHOULD NOT contain anything after it. As the size
 of any chunk is even, the size given by the RIFF header is also even. The
 contents of individual chunks will be described in the following sections.
-Simple file format (lossy)
+
 Simple File Format (Lossy)
 --------------------------
 This layout SHOULD be used if the image requires _lossy_ encoding and does not
@ -215,7 +210,8 @@ width and height. That is assumed to be the width and height of the canvas.
 The VP8 specification describes how to decode the image into Y'CbCr
 format. To convert to RGB, Rec. 601 SHOULD be used.
-Simple file format (lossless)
+
 Simple File Format (Lossless)
 -----------------------------
 **Note:** Older readers may not support files using the lossless format.
@ -253,7 +249,8 @@ The current specification of the VP8L bitstream can be found at
 contains the VP8L image width and height. That is assumed to be the width
 and height of the canvas.
-Extended file format
+
 Extended File Format
 --------------------
 **Note:** Older readers may not support files using the extended format.
@ -278,10 +275,6 @@ For a _still image_, the _image data_ consists of a single frame, whereas for
 an _animated image_, it consists of multiple frames. More details about frames
 can be found in the [Animation](#animation) section.
 Moreover, each frame can be fragmented or non-fragmented, as will be described
 in the [Extended WebP file header](#extended_header) section. More details about
 fragments can be found in the [Fragments](#fragments) section.
 All chunks SHOULD be placed in the same order as listed above. If a chunk
 appears in the wrong place, the file is invalid, but readers MAY parse the
 file, ignoring the chunks that come too late.
@ -302,7 +295,7 @@ Extended WebP file header:
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |                      ChunkHeader('VP8X')                      |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |Rsv|I|L|E|X|A|F|                   Reserved                    |
+    |Rsv|I|L|E|X|A|R|                   Reserved                    |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |          Canvas Width Minus One               |             ...
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@ -335,9 +328,9 @@ Animation (A): 1 bit
 : Set if this is an animated image. Data in 'ANIM' and 'ANMF' chunks should be
  used to control the animation.
-Image Fragmentation (F): 1 bit _\[status: experimental\]_
+Reserved (R): 1 bit
-: Set if any of the frames in the image are represented by fragments.
+: SHOULD be `0`.
 Reserved: 24 bits
@ -382,13 +375,20 @@ animation.
 Background Color: 32 bits (_uint32_)
 : The default background color of the canvas in \[Blue, Green, Red, Alpha\]
-byte order. This color is used to fill the unused space on the canvas around the
+  byte order. This color MAY be used to fill the unused space on the canvas
-frames, as well as the transparent pixels of the first frame. Background color
+  around the frames, as well as the transparent pixels of the first frame.
-is also used when disposal method is `1`.
+  Background color is also used when disposal method is `1`.
-**Note**: Viewers that have a preferred background against which to present the
+**Note**:
-images (web browsers, for example) should ignore this value and use their
+
-preferred background color instead.
+  * Background color MAY contain a transparency value (alpha), even if the
    _Alpha_ flag in [VP8X chunk](#extended_header) is unset.
  * Viewer applications SHOULD treat the background color value as a hint, and
    are not required to use it.
  * The canvas is cleared at the start of each loop. The background color MAY be
    used to achieve this.
 Loop Count: 16 bits (_uint16_)
@ -398,7 +398,6 @@ This chunk MUST appear if the _Animation_ flag in the VP8X chunk is set.
 If the _Animation_ flag is not set and this chunk is present, it
 SHOULD be ignored.
 ANMF chunk:
 For animated images, this chunk contains information about a _single_ frame.
@ -415,7 +414,7 @@ If the _Animation flag_ is not set, then this chunk SHOULD NOT be present.
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    ...             |           Frame Height Minus One              |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |                 Frame Duration                |  Reserved   |D|
+    |                 Frame Duration                |  Reserved |B|D|
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |                         Frame Data                            |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@ -441,28 +440,38 @@ Frame Height Minus One: 24 bits (_uint24_)
 Frame Duration: 24 bits (_uint24_)
 : The time to wait before displaying the next frame, in 1 millisecond units.
-In particular, frame duration of 0 is useful when one wants to update multiple
+  In particular, frame duration of 0 is useful when one wants to update
-areas of the canvas at once during the animation.
+  multiple areas of the canvas at once during the animation.
-Reserved: 7 bits
+Reserved: 6 bits
 : SHOULD be 0.
 Blending method (B): 1 bit
 : Indicates how transparent pixels of _the current frame_ are to be blended
  with corresponding pixels of the previous canvas:
    * `0`: Use alpha blending. After disposing of the previous frame, render the
      current frame on the canvas using [alpha-blending](#alpha-blending). If
      the current frame does not have an alpha channel, assume alpha value of
      255, effectively replacing the rectangle.
    * `1`: Do not blend. After disposing of the previous frame, render the
      current frame on the canvas by overwriting the rectangle covered by the
      current frame.
 Disposal method (D): 1 bit
-: Indicates how _the current frame_ is to be treated after it has been displayed
+: Indicates how _the current frame_ is to be treated after it has been
-(before rendering the next frame) on the canvas:
+  displayed (before rendering the next frame) on the canvas:
    * `0`: Do not dispose. Leave the canvas as is.
-  * `1`: Dispose to background color. Fill the _rectangle_ on the canvas covered
+    * `1`: Dispose to background color. Fill the _rectangle_ on the canvas
-    by the _current frame_ with background color specified in the
+      covered by the _current frame_ with background color specified in the
      [ANIM chunk](#anim_chunk).
 After disposing the current frame, render the next frame on the canvas using
 [alpha-blending](#alpha-blending). If the next frame does not have an alpha
 channel, assume alpha value of 255, effectively replacing the rectangle.
 **Notes**:
  * The frame disposal only applies to the _frame rectangle_, that is, the
@ -492,9 +501,7 @@ channel, assume alpha value of 255, effectively replacing the rectangle.
 Frame Data: _Chunk Size_ - `16` bytes
-: For a fragmented frame, it consists of multiple [fragment chunks](#fragments).
+: Consists of:
 : For a non-fragmented frame, it consists of:
  * An optional [alpha subchunk](#alpha) for the frame.
@ -505,49 +512,6 @@ Frame Data: _Chunk Size_ - `16` bytes
 **Note**: The 'ANMF' payload, _Frame Data_ above, consists of individual
 _padded_ chunks as described by the [RIFF file format](#riff-file-format).
 #### Fragments _\[status: experimental\]_
 For images that are represented by fragments, this chunk contains data for
 a single fragment. If the _Image Fragmentation Flag_ is not set, then this chunk
 SHOULD NOT be present.
     0                   1                   2                   3
     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |                      ChunkHeader('FRGM')                      |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |                  Fragment X                   |             ...
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    ...       Fragment Y            |         Fragment Data         |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 Fragment X: 24 bits (_uint24_)
 : The X coordinate of the upper left corner of the fragment is `Fragment X * 2`
 Fragment Y: 24 bits (_uint24_)
 : The Y coordinate of the upper left corner of the fragment is `Fragment Y * 2`
 Fragment Data: _Chunk Size_ - `6` bytes
 : It contains:
  * An optional [alpha subchunk](#alpha) for the fragment.
  * The [bitstream subchunk](#bitstream-vp8vp8l) for the fragment.
  * An optional list of [unknown chunks](#unknown-chunks).
 Note: The width and height of the fragment is obtained from the bitstream
 subchunk.
 The fragments of a frame SHOULD have the following properties:
  * They collectively cover the whole frame.
  * No pair of fragments have any overlapping region on the frame.
  * No portion of any fragment should be located outside of the canvas.
 #### Alpha
     0                   1                   2                   3
@ -629,8 +593,8 @@ Alpha bitstream: _Chunk Size_ - `1` bytes
 : Encoded alpha bitstream.
-This optional chunk contains encoded alpha data for this frame/fragment. A
+This optional chunk contains encoded alpha data for this frame. A frame
-frame/fragment containing a 'VP8L' chunk SHOULD NOT contain this chunk.
+containing a 'VP8L' chunk SHOULD NOT contain this chunk.
 **Rationale**: The transparency information is already part of the 'VP8L'
 chunk.
@ -661,15 +625,15 @@ compression method is '0') or compressed using the lossless format
 #### Bitstream (VP8/VP8L)
-This chunk contains compressed bitstream data for a single frame/fragment.
+This chunk contains compressed bitstream data for a single frame.
 A bitstream chunk may be either (i) a VP8 chunk, using "VP8 " (note the
 significant fourth-character space) as its tag _or_ (ii) a VP8L chunk, using
 "VP8L" as its tag.
 The formats of VP8 and VP8L chunks are as described in sections
-[Simple file format (lossy)](#simple-file-format-lossy)
+[Simple File Format (Lossy)](#simple-file-format-lossy)
-and [Simple file format (lossless)](#simple-file-format-lossless) respectively.
+and [Simple File Format (Lossless)](#simple-file-format-lossless) respectively.
 #### Color profile
@ -717,7 +681,6 @@ EXIF Metadata: _Chunk Size_ bytes
 : image metadata in EXIF format.
 XMP chunk:
     0                   1                   2                   3
@ -748,47 +711,17 @@ A file MAY contain unknown chunks:
  * At the end of the file as described in [Extended WebP file
    header](#extended_header) section.
-  * At the end of FRGM and ANMF chunks as described in [Fragments](#fragments)
+  * At the end of ANMF chunks as described in the
-    and [Animation](#animation) sections.
+    [Animation](#animation) section.
 Readers SHOULD ignore these chunks. Writers SHOULD preserve them in their
 original order (unless they specifically intend to modify these chunks).
-### Assembling the Canvas from fragments/frames
+### Assembling the Canvas from frames
-Here we provide an overview of how a reader should assemble a canvas in case
+Here we provide an overview of how a reader should assemble a canvas in the
-of a fragmented-image and in case of an animated image. The notation
+case of an animated image. The notation _VP8X.field_ means the field in the
-_VP8X.field_ means the field in the 'VP8X' chunk with the same description.
+'VP8X' chunk with the same description.
 Displaying a _fragmented image_ canvas MUST be equivalent to the following
 pseudocode: _\[status: experimental\]_
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 assert VP8X.flags.hasFragments
 canvas ← new black image of size VP8X.canvasWidth x VP8X.canvasHeight.
 frgm_params ← nil
 for chunk in image_data:
    assert chunk.tag is "FRGM"
    frgm_params.fragmentX = Fragment X
    frgm_params.fragmentY = Fragment Y
    for subchunk in 'Fragment Data':
        if subchunk.tag == "ALPH":
            assert alpha subchunks not found in 'Fragment Data' earlier
            frgm_params.alpha = alpha_data
        else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
            assert bitstream subchunks not found in 'Fragment Data' earlier
            frgm_params.bitstream = bitstream_data
    frgm_params.fragmentWidth = Width extracted from bitstream subchunk
    frgm_params.fragmentHeight = Height extracted from bitstream subchunk
    assert VP8X.canvasWidth >=
        frgm_params.fragmentX + frgm_params.fragmentWidth
    assert VP8X.canvasHeight >=
        frgm_params.fragmentY + frgm_params.fragmentHeight
    assert fragment has the properties mentioned in "Image Fragments" section.
    render fragment with frame_params.alpha and frame_params.bitstream on canvas
    with top-left corner in (frgm_params.fragmentX, frgm_params.fragmentY).
 canvas contains the decoded canvas.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Displaying an _animated image_ canvas MUST be equivalent to the following
 pseudocode:
@ -802,22 +735,19 @@ dispose_method ← ANIM.disposeMethod
 if loop_count == 0:
    loop_count = ∞
 frame_params ← nil
 for loop = 0, ..., loop_count - 1
 assert next chunk in image_data is ANMF
 for loop = 0..loop_count - 1
    clear canvas to ANIM.background_color or application defined color
    until eof or non-ANMF chunk
        frame_params.frameX = Frame X
        frame_params.frameY = Frame Y
        frame_params.frameWidth = Frame Width Minus One + 1
        frame_params.frameHeight = Frame Height Minus One + 1
        frame_params.frameDuration = Frame Duration
-    assert VP8X.canvasWidth >= frame_params.frameX + frame_params.frameWidth
+        frame_right = frame_params.frameX + frame_params.frameWidth
-    assert VP8X.canvasHeight >= frame_params.frameY + frame_params.frameHeight
+        frame_bottom = frame_params.frameY + frame_params.frameHeight
-    if VP8X.flags.hasFragments and first subchunk in 'Frame Data' is FRGM
+        assert VP8X.canvasWidth >= frame_right
-        // Fragmented frame.
+        assert VP8X.canvasHeight >= frame_bottom
        frame_params.{bitstream,alpha} = canvas decoded from subchunks in
                                         'Frame Data' as per the pseudocode for
                                         _fragmented image_ above.
    else
        // Non-fragmented frame.
        for subchunk in 'Frame Data':
            if subchunk.tag == "ALPH":
                assert alpha subchunks not found in 'Frame Data' earlier
@ -825,14 +755,15 @@ for loop = 0, ..., loop_count - 1
            else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
                assert bitstream subchunks not found in 'Frame Data' earlier
                frame_params.bitstream = bitstream_data
-    render frame with frame_params.alpha and frame_params.bitstream on canvas
+        render frame with frame_params.alpha and frame_params.bitstream on
-    with top-left corner in (frame_params.frameX, frame_params.frameY), using
+            canvas with top-left corner at (frame_params.frameX,
-    dispose method dispose_method.
+            frame_params.frameY), using dispose method dispose_method.
-    Show the contents of the image for frame_params.frameDuration * 1ms.
+        canvas contains the decoded image.
-canvas contains the decoded canvas.
+        Show the contents of the canvas for frame_params.frameDuration * 1ms.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Example file layouts
+
 Example File Layouts
 --------------------
 A lossy encoded image with alpha may look as follows:
@ -864,17 +795,6 @@ RIFF/WEBP
 +- XMP  (metadata)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 A fragmented image may look as follows:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 RIFF/WEBP
 +- VP8X (descriptions of features used)
 +- FRGM (fragment1 parameters + data)
 +- FRGM (fragment2 parameters + data)
 +- FRGM (fragment3 parameters + data)
 +- FRGM (fragment4 parameters + data)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 An animated image with EXIF metadata may look as follows:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -892,4 +812,5 @@ RIFF/WEBP
 [webpllspec]: https://gerrit.chromium.org/gerrit/gitweb?p=webm/libwebp.git;a=blob;f=doc/webp-lossless-bitstream-spec.txt;hb=master
 [iccspec]: http://www.color.org/icc_specs2.xalter
 [metadata]: http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
 [rfc 1166]: http://tools.ietf.org/html/rfc1166
 [rfc 2119]: http://tools.ietf.org/html/rfc2119
--- a/doc/webp-lossless-bitstream-spec.txt
+++ b/doc/webp-lossless-bitstream-spec.txt
@ -14,6 +14,7 @@ Specification for WebP Lossless Bitstream
 _Jyrki Alakuijala, Ph.D., Google, Inc., 2012-06-19_
 Paragraphs marked as \[AMENDED\] were amended on 2014-09-16.
 Abstract
 --------
@ -172,8 +173,8 @@ It should be set to 0 when all alpha values are 255 in the picture, and
 int alpha_is_used = ReadBits(1);
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The version_number is a 3 bit code that must be discarded by the decoder
+The version_number is a 3-bit code that must be set to 0. Any other value
-at this time. Complying encoders write a 3-bit value 0.
+should be treated as an error. \[AMENDED\]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 int version_number = ReadBits(3);
@ -330,7 +331,7 @@ uint32 Select(uint32 L, uint32 T, uint32 TL) {
           abs(pGreen - GREEN(T)) + abs(pBlue - BLUE(T));
  // Return either left or top, the one closer to the prediction.
-  if (pL <= pT) {
+  if (pL < pT) {     // \[AMENDED\]
    return L;
  } else {
    return T;
@ -542,6 +543,9 @@ color.
 argb = color_table[GREEN(argb)];
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 If the index is equal to or larger than color_table_size, the argb color
 value should be set to 0x00000000 (transparent black).  \[AMENDED\]
 When the color table is small (equal to or less than 16 colors), several
 pixels are bundled into a single pixel. The pixel bundling packs several
 (2, 4, or 8) pixels into a single pixel, reducing the image width
--- a/examples/Android.mk
+++ b/examples/Android.mk
@ -0,0 +1,71 @@
 LOCAL_PATH := $(call my-dir)
 ################################################################################
 # libexample_util
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
    example_util.c \
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
 LOCAL_MODULE := example_util
 include $(BUILD_STATIC_LIBRARY)
 ################################################################################
 # cwebp
 include $(CLEAR_VARS)
 # Note: to enable jpeg/png encoding the sources from AOSP can be used with
 # minor modification to their Android.mk files.
 LOCAL_SRC_FILES := \
    cwebp.c \
    jpegdec.c \
    metadata.c \
    pngdec.c \
    tiffdec.c \
    webpdec.c \
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
 LOCAL_STATIC_LIBRARIES := example_util webp
 LOCAL_MODULE := cwebp
 include $(BUILD_EXECUTABLE)
 ################################################################################
 # dwebp
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
    dwebp.c \
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
 LOCAL_STATIC_LIBRARIES := example_util webp
 LOCAL_MODULE := dwebp
 include $(BUILD_EXECUTABLE)
 ################################################################################
 # webpmux
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
    webpmux.c \
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
 LOCAL_STATIC_LIBRARIES := example_util webpmux webp
 LOCAL_MODULE := webpmux_example
 include $(BUILD_EXECUTABLE)
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@ -1,6 +1,9 @@
-AM_CPPFLAGS = -I$(top_srcdir)/src
+AM_CPPFLAGS = -I$(top_builddir)/src -I$(top_srcdir)/src
 bin_PROGRAMS = dwebp cwebp
 if BUILD_VWEBP
  bin_PROGRAMS += vwebp
 endif
 if WANT_MUX
  bin_PROGRAMS += webpmux
 endif
@ -11,29 +14,25 @@ endif
 noinst_LTLIBRARIES = libexampleutil.la
-libexampleutil_la_SOURCES = example_util.c example_util.h
+libexampleutil_la_SOURCES = example_util.c example_util.h stopwatch.h
 dwebp_SOURCES = dwebp.c stopwatch.h
 dwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 dwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES)
 dwebp_LDADD = libexampleutil.la $(PNG_LIBS) $(JPEG_LIBS)
 if BUILD_LIBWEBPDECODER
  dwebp_LDADD += ../src/libwebpdecoder.la
 else
  dwebp_LDADD += ../src/libwebp.la
 endif
 cwebp_SOURCES  = cwebp.c metadata.c metadata.h stopwatch.h
 cwebp_SOURCES += jpegdec.c jpegdec.h
 cwebp_SOURCES += pngdec.c pngdec.h
 cwebp_SOURCES += tiffdec.c tiffdec.h
 cwebp_SOURCES += webpdec.c webpdec.h
 cwebp_SOURCES += wicdec.c wicdec.h
 cwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 cwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
-cwebp_LDADD = ../src/libwebp.la $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)
+cwebp_LDADD  = libexampleutil.la ../src/libwebp.la
 cwebp_LDADD += $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)
-gif2webp_SOURCES = gif2webp.c
+gif2webp_SOURCES = gif2webp.c gif2webp_util.c
 gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
 gif2webp_LDADD  = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
 gif2webp_LDADD += $(GIF_LIBS)
@ -41,3 +40,15 @@ gif2webp_LDADD += $(GIF_LIBS)
 webpmux_SOURCES = webpmux.c
 webpmux_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 webpmux_LDADD = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
 vwebp_SOURCES = vwebp.c
 vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GL_INCLUDES)
 vwebp_LDADD = libexampleutil.la ../src/demux/libwebpdemux.la $(GL_LIBS)
 if BUILD_LIBWEBPDECODER
  dwebp_LDADD += ../src/libwebpdecoder.la
  vwebp_LDADD += ../src/libwebpdecoder.la
 else
  dwebp_LDADD += ../src/libwebp.la
  vwebp_LDADD += ../src/libwebp.la
 endif
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  simple command line calling the WebPEncode function.
@ -15,27 +17,29 @@
 #include <string.h>
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "webp/config.h"
 #endif
 #include "webp/encode.h"
 #include "./example_util.h"
 #include "./metadata.h"
 #include "./stopwatch.h"
 #include "./jpegdec.h"
 #include "./pngdec.h"
 #include "./tiffdec.h"
 #include "./webpdec.h"
 #include "./wicdec.h"
 #ifndef WEBP_DLL
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 extern void* VP8GetCPUInfo;   // opaque forward declaration.
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  // WEBP_DLL
@ -91,6 +95,9 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic,
  } else {
    // If no size specified, try to decode it using WIC.
    ok = ReadPictureWithWIC(filename, pic, keep_alpha, metadata);
    if (!ok) {
      ok = ReadWebP(filename, pic, keep_alpha, metadata);
    }
  }
  if (!ok) {
    fprintf(stderr, "Error! Could not process file %s\n", filename);
@ -104,26 +111,30 @@ typedef enum {
  PNG_ = 0,
  JPEG_,
  TIFF_,  // 'TIFF' clashes with libtiff
  WEBP_,
  UNSUPPORTED
 } InputFileFormat;
 static InputFileFormat GetImageType(FILE* in_file) {
  InputFileFormat format = UNSUPPORTED;
-  unsigned int magic;
+  uint32_t magic1, magic2;
-  unsigned char buf[4];
+  uint8_t buf[12];
-  if ((fread(&buf[0], 4, 1, in_file) != 1) ||
+  if ((fread(&buf[0], 12, 1, in_file) != 1) ||
      (fseek(in_file, 0, SEEK_SET) != 0)) {
    return format;
  }
-  magic = (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
+  magic1 = ((uint32_t)buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
-  if (magic == 0x89504E47U) {
+  magic2 = ((uint32_t)buf[8] << 24) | (buf[9] << 16) | (buf[10] << 8) | buf[11];
  if (magic1 == 0x89504E47U) {
    format = PNG_;
-  } else if (magic >= 0xFFD8FF00U && magic <= 0xFFD8FFFFU) {
+  } else if (magic1 >= 0xFFD8FF00U && magic1 <= 0xFFD8FFFFU) {
    format = JPEG_;
-  } else if (magic == 0x49492A00 || magic == 0x4D4D002A) {
+  } else if (magic1 == 0x49492A00 || magic1 == 0x4D4D002A) {
    format = TIFF_;
  } else if (magic1 == 0x52494646 && magic2 == 0x57454250) {
    format = WEBP_;
  }
  return format;
 }
@ -146,6 +157,8 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic,
      ok = ReadJPEG(in_file, pic, metadata);
    } else if (format == TIFF_) {
      ok = ReadTIFF(filename, pic, keep_alpha, metadata);
    } else if (format == WEBP_) {
      ok = ReadWebP(filename, pic, keep_alpha, metadata);
    }
  } else {
    // If image size is specified, infer it as YUV format.
@ -264,10 +277,6 @@ static void PrintExtraInfoLossy(const WebPPicture* const pic, int short_output,
        fprintf(stderr, "             transparency:   %6d (%.1f dB)\n",
                stats->alpha_data_size, stats->PSNR[4]);
      }
      if (stats->layer_data_size) {
        fprintf(stderr, "             enhancement:    %6d\n",
                stats->layer_data_size);
      }
      fprintf(stderr, " Residuals bytes  "
                      "|segment 1|segment 2|segment 3"
                      "|segment 4|  total\n");
@ -296,6 +305,9 @@ static void PrintExtraInfoLossy(const WebPPicture* const pic, int short_output,
      PrintFullLosslessInfo(stats, "alpha");
    }
  }
 }
 static void PrintMapInfo(const WebPPicture* const pic) {
  if (pic->extra_info != NULL) {
    const int mb_w = (pic->width + 15) / 16;
    const int mb_h = (pic->height + 15) / 16;
@ -305,18 +317,18 @@ static void PrintExtraInfoLossy(const WebPPicture* const pic, int short_output,
      for (x = 0; x < mb_w; ++x) {
        const int c = pic->extra_info[x + y * mb_w];
        if (type == 1) {   // intra4/intra16
-          printf("%c", "+."[c]);
+          fprintf(stderr, "%c", "+."[c]);
        } else if (type == 2) {    // segments
-          printf("%c", ".-*X"[c]);
+          fprintf(stderr, "%c", ".-*X"[c]);
        } else if (type == 3) {    // quantizers
-          printf("%.2d ", c);
+          fprintf(stderr, "%.2d ", c);
        } else if (type == 6 || type == 7) {
-          printf("%3d ", c);
+          fprintf(stderr, "%3d ", c);
        } else {
-          printf("0x%.2x ", c);
+          fprintf(stderr, "0x%.2x ", c);
        }
      }
-      printf("\n");
+      fprintf(stderr, "\n");
    }
  }
 }
@ -492,11 +504,14 @@ static int WriteWebPWithMetadata(FILE* const out,
    if (has_vp8x) {  // update the existing VP8X flags
      webp[kChunkHeaderSize] |= (uint8_t)(flags & 0xff);
      ok = ok && (fwrite(webp, kVP8XChunkSize, 1, out) == 1);
      webp += kVP8XChunkSize;
      webp_size -= kVP8XChunkSize;
    } else {
      const int is_lossless = !memcmp(webp, "VP8L", kTagSize);
-      // The alpha flag is forced with lossless images.
+      if (is_lossless) {
-      if (is_lossless) flags |= kAlphaFlag;
+        // Presence of alpha is stored in the 29th bit of VP8L data.
        if (webp[kChunkHeaderSize + 3] & (1 << 5)) flags |= kAlphaFlag;
      }
      ok = ok && (fwrite(kVP8XHeader, kChunkHeaderSize, 1, out) == 1);
      ok = ok && WriteLE32(out, flags);
      ok = ok && WriteLE24(out, picture->width - 1);
@ -526,9 +541,8 @@ static int WriteWebPWithMetadata(FILE* const out,
 //------------------------------------------------------------------------------
 static int ProgressReport(int percent, const WebPPicture* const picture) {
-  printf("[%s]: %3d %%      \r",
+  fprintf(stderr, "[%s]: %3d %%      \r",
          (char*)picture->user_data, percent);
  fflush(stdout);
  return 1;  // all ok
 }
@ -545,35 +559,39 @@ static void HelpShort(void) {
 static void HelpLong(void) {
  printf("Usage:\n");
  printf(" cwebp [-preset <...>] [options] in_file [-o out_file]\n\n");
-  printf("If input size (-s) for an image is not specified, "
+  printf("If input size (-s) for an image is not specified, it is\n"
-         "it is assumed to be a PNG, JPEG or TIFF file.\n");
+         "assumed to be a PNG, JPEG, TIFF or WebP file.\n");
 #ifdef HAVE_WINCODEC_H
-  printf("Windows builds can take as input any of the files handled by WIC\n");
+  printf("Windows builds can take as input any of the files handled by WIC.\n");
 #endif
-  printf("options:\n");
+  printf("\nOptions:\n");
  printf("  -h / -help  ............ short help\n");
  printf("  -H / -longhelp  ........ long help\n");
  printf("  -q <float> ............. quality factor (0:small..100:big)\n");
-  printf("  -alpha_q <int> ......... Transparency-compression quality "
+  printf("  -alpha_q <int> ......... transparency-compression quality "
-         "(0..100).\n");
+         "(0..100)\n");
-  printf("  -preset <string> ....... Preset setting, one of:\n");
+  printf("  -preset <string> ....... preset setting, one of:\n");
  printf("                            default, photo, picture,\n");
  printf("                            drawing, icon, text\n");
-  printf("     -preset must come first, as it overwrites other parameters.");
+  printf("     -preset must come first, as it overwrites other parameters\n");
 #if WEBP_ENCODER_ABI_VERSION > 0x0202
  printf("  -z <int> ............... activates lossless preset with given\n"
         "                           level in [0:fast, ..., 9:slowest]\n");
 #endif
  printf("\n");
  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
  printf("  -segments <int> ........ number of segments to use (1..4)\n");
-  printf("  -size <int> ............ Target size (in bytes)\n");
+  printf("  -size <int> ............ target size (in bytes)\n");
-  printf("  -psnr <float> .......... Target PSNR (in dB. typically: 42)\n");
+  printf("  -psnr <float> .......... target PSNR (in dB. typically: 42)\n");
  printf("\n");
-  printf("  -s <int> <int> ......... Input size (width x height) for YUV\n");
+  printf("  -s <int> <int> ......... input size (width x height) for YUV\n");
-  printf("  -sns <int> ............. Spatial Noise Shaping (0:off, 100:max)\n");
+  printf("  -sns <int> ............. spatial noise shaping (0:off, 100:max)\n");
  printf("  -f <int> ............... filter strength (0=off..100)\n");
  printf("  -sharpness <int> ....... "
         "filter sharpness (0:most .. 7:least sharp)\n");
  printf("  -strong ................ use strong filter instead "
-                                     "of simple (default).\n");
+                                     "of simple (default)\n");
-  printf("  -nostrong .............. use simple filter instead of strong.\n");
+  printf("  -nostrong .............. use simple filter instead of strong\n");
  printf("  -partition_limit <int> . limit quality to fit the 512k limit on\n");
  printf("                           "
         "the first partition (0=no degradation ... 100=full)\n");
@ -582,22 +600,23 @@ static void HelpLong(void) {
  printf("  -resize <w> <h> ........ resize picture (after any cropping)\n");
  printf("  -mt .................... use multi-threading if available\n");
  printf("  -low_memory ............ reduce memory usage (slower encoding)\n");
-#ifdef WEBP_EXPERIMENTAL_FEATURES
+  printf("  -map <int> ............. print map of extra info\n");
-  printf("  -444 / -422 / -gray ..... Change colorspace\n");
+  printf("  -print_psnr ............ prints averaged PSNR distortion\n");
-#endif
+  printf("  -print_ssim ............ prints averaged SSIM distortion\n");
-  printf("  -map <int> ............. print map of extra info.\n");
+  printf("  -print_lsim ............ prints local-similarity distortion\n");
-  printf("  -print_psnr ............ prints averaged PSNR distortion.\n");
+  printf("  -d <file.pgm> .......... dump the compressed output (PGM file)\n");
-  printf("  -print_ssim ............ prints averaged SSIM distortion.\n");
+  printf("  -alpha_method <int> .... transparency-compression method (0..1)\n");
-  printf("  -print_lsim ............ prints local-similarity distortion.\n");
+  printf("  -alpha_filter <string> . predictive filtering for alpha plane,\n");
-  printf("  -d <file.pgm> .......... dump the compressed output (PGM file).\n");
+  printf("                           one of: none, fast (default) or best\n");
-  printf("  -alpha_method <int> .... Transparency-compression method (0..1)\n");
+  printf("  -alpha_cleanup ......... clean RGB values in transparent area\n");
-  printf("  -alpha_filter <string> . predictive filtering for alpha plane.\n");
+  printf("  -blend_alpha <hex> ..... blend colors against background color\n"
-  printf("                           One of: none, fast (default) or best.\n");
+         "                           expressed as RGB values written in\n"
-  printf("  -alpha_cleanup ......... Clean RGB values in transparent area.\n");
+         "                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0\n"
-  printf("  -noalpha ............... discard any transparency information.\n");
+         "                           green=0xe0 and blue=0xd0\n");
-  printf("  -lossless .............. Encode image losslessly.\n");
+  printf("  -noalpha ............... discard any transparency information\n");
-  printf("  -hint <string> ......... Specify image characteristics hint.\n");
+  printf("  -lossless .............. encode image losslessly\n");
-  printf("                           One of: photo, picture or graph\n");
+  printf("  -hint <string> ......... specify image characteristics hint,\n");
  printf("                           one of: photo, picture or graph\n");
  printf("\n");
  printf("  -metadata <string> ..... comma separated list of metadata to\n");
@ -608,18 +627,18 @@ static void HelpLong(void) {
  printf("\n");
  printf("  -short ................. condense printed message\n");
-  printf("  -quiet ................. don't print anything.\n");
+  printf("  -quiet ................. don't print anything\n");
-  printf("  -version ............... print version number and exit.\n");
+  printf("  -version ............... print version number and exit\n");
 #ifndef WEBP_DLL
-  printf("  -noasm ................. disable all assembly optimizations.\n");
+  printf("  -noasm ................. disable all assembly optimizations\n");
 #endif
  printf("  -v ..................... verbose, e.g. print encoding/decoding "
         "times\n");
  printf("  -progress .............. report encoding progress\n");
  printf("\n");
  printf("Experimental Options:\n");
-  printf("  -jpeg_like ............. Roughly match expected JPEG size.\n");
+  printf("  -jpeg_like ............. roughly match expected JPEG size\n");
-  printf("  -af .................... auto-adjust filter strength.\n");
+  printf("  -af .................... auto-adjust filter strength\n");
  printf("  -pre <int> ............. pre-processing filter\n");
  printf("\n");
 }
@ -627,7 +646,7 @@ static void HelpLong(void) {
 //------------------------------------------------------------------------------
 // Error messages
-static const char* const kErrorMessages[] = {
+static const char* const kErrorMessages[VP8_ENC_ERROR_LAST] = {
  "OK",
  "OUT_OF_MEMORY: Out of memory allocating objects",
  "BITSTREAM_OUT_OF_MEMORY: Out of memory re-allocating byte buffer",
@ -656,8 +675,14 @@ int main(int argc, const char *argv[]) {
  int short_output = 0;
  int quiet = 0;
  int keep_alpha = 1;
  int blend_alpha = 0;
  uint32_t background_color = 0xffffffu;
  int crop = 0, crop_x = 0, crop_y = 0, crop_w = 0, crop_h = 0;
  int resize_w = 0, resize_h = 0;
 #if WEBP_ENCODER_ABI_VERSION > 0x0202
  int lossless_preset = 6;
  int use_lossless_preset = -1;  // -1=unset, 0=don't use, 1=use it
 #endif
  int show_progress = 0;
  int keep_metadata = 0;
  int metadata_written = 0;
@ -685,6 +710,7 @@ int main(int argc, const char *argv[]) {
  }
  for (c = 1; c < argc; ++c) {
    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      HelpShort();
      return 0;
@ -706,20 +732,34 @@ int main(int argc, const char *argv[]) {
      config.show_compressed = 1;
      print_distortion = 2;
    } else if (!strcmp(argv[c], "-short")) {
-      short_output++;
+      ++short_output;
    } else if (!strcmp(argv[c], "-s") && c < argc - 2) {
-      picture.width = strtol(argv[++c], NULL, 0);
+      picture.width = ExUtilGetInt(argv[++c], 0, &parse_error);
-      picture.height = strtol(argv[++c], NULL, 0);
+      picture.height = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
-      config.method = strtol(argv[++c], NULL, 0);
+      config.method = ExUtilGetInt(argv[++c], 0, &parse_error);
 #if WEBP_ENCODER_ABI_VERSION > 0x0202
      use_lossless_preset = 0;   // disable -z option
 #endif
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
-      config.quality = (float)strtod(argv[++c], NULL);
+      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
 #if WEBP_ENCODER_ABI_VERSION > 0x0202
      use_lossless_preset = 0;   // disable -z option
    } else if (!strcmp(argv[c], "-z") && c < argc - 1) {
      lossless_preset = ExUtilGetInt(argv[++c], 0, &parse_error);
      if (use_lossless_preset != 0) use_lossless_preset = 1;
 #endif
    } else if (!strcmp(argv[c], "-alpha_q") && c < argc - 1) {
-      config.alpha_quality = strtol(argv[++c], NULL, 0);
+      config.alpha_quality = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-alpha_method") && c < argc - 1) {
-      config.alpha_compression = strtol(argv[++c], NULL, 0);
+      config.alpha_compression = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-alpha_cleanup")) {
      keep_alpha = keep_alpha ? 2 : 0;
    } else if (!strcmp(argv[c], "-blend_alpha") && c < argc - 1) {
      blend_alpha = 1;
      // background color is given in hex with an optional '0x' prefix
      background_color = ExUtilGetInt(argv[++c], 16, &parse_error);
      background_color = background_color & 0x00ffffffu;
    } else if (!strcmp(argv[c], "-alpha_filter") && c < argc - 1) {
      ++c;
      if (!strcmp(argv[c], "none")) {
@ -736,7 +776,6 @@ int main(int argc, const char *argv[]) {
      keep_alpha = 0;
    } else if (!strcmp(argv[c], "-lossless")) {
      config.lossless = 1;
      picture.use_argb = 1;
    } else if (!strcmp(argv[c], "-hint") && c < argc - 1) {
      ++c;
      if (!strcmp(argv[c], "photo")) {
@ -750,13 +789,13 @@ int main(int argc, const char *argv[]) {
        goto Error;
      }
    } else if (!strcmp(argv[c], "-size") && c < argc - 1) {
-      config.target_size = strtol(argv[++c], NULL, 0);
+      config.target_size = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-psnr") && c < argc - 1) {
-      config.target_PSNR = (float)strtod(argv[++c], NULL);
+      config.target_PSNR = ExUtilGetFloat(argv[++c], &parse_error);
    } else if (!strcmp(argv[c], "-sns") && c < argc - 1) {
-      config.sns_strength = strtol(argv[++c], NULL, 0);
+      config.sns_strength = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
-      config.filter_strength = strtol(argv[++c], NULL, 0);
+      config.filter_strength = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-af")) {
      config.autofilter = 1;
    } else if (!strcmp(argv[c], "-jpeg_like")) {
@ -770,34 +809,26 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-nostrong")) {
      config.filter_type = 0;
    } else if (!strcmp(argv[c], "-sharpness") && c < argc - 1) {
-      config.filter_sharpness = strtol(argv[++c], NULL, 0);
+      config.filter_sharpness = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-pass") && c < argc - 1) {
-      config.pass = strtol(argv[++c], NULL, 0);
+      config.pass = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-pre") && c < argc - 1) {
-      config.preprocessing = strtol(argv[++c], NULL, 0);
+      config.preprocessing = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-segments") && c < argc - 1) {
-      config.segments = strtol(argv[++c], NULL, 0);
+      config.segments = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-partition_limit") && c < argc - 1) {
-      config.partition_limit = strtol(argv[++c], NULL, 0);
+      config.partition_limit = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-map") && c < argc - 1) {
-      picture.extra_info_type = strtol(argv[++c], NULL, 0);
+      picture.extra_info_type = ExUtilGetInt(argv[++c], 0, &parse_error);
 #ifdef WEBP_EXPERIMENTAL_FEATURES
    } else if (!strcmp(argv[c], "-444")) {
      picture.colorspace = WEBP_YUV444;
    } else if (!strcmp(argv[c], "-422")) {
      picture.colorspace = WEBP_YUV422;
    } else if (!strcmp(argv[c], "-gray")) {
      picture.colorspace = WEBP_YUV400;
 #endif
    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
      crop = 1;
-      crop_x = strtol(argv[++c], NULL, 0);
+      crop_x = ExUtilGetInt(argv[++c], 0, &parse_error);
-      crop_y = strtol(argv[++c], NULL, 0);
+      crop_y = ExUtilGetInt(argv[++c], 0, &parse_error);
-      crop_w = strtol(argv[++c], NULL, 0);
+      crop_w = ExUtilGetInt(argv[++c], 0, &parse_error);
-      crop_h = strtol(argv[++c], NULL, 0);
+      crop_h = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-resize") && c < argc - 2) {
-      resize_w = strtol(argv[++c], NULL, 0);
+      resize_w = ExUtilGetInt(argv[++c], 0, &parse_error);
-      resize_h = strtol(argv[++c], NULL, 0);
+      resize_h = ExUtilGetInt(argv[++c], 0, &parse_error);
 #ifndef WEBP_DLL
    } else if (!strcmp(argv[c], "-noasm")) {
      VP8GetCPUInfo = NULL;
@ -882,6 +913,9 @@ int main(int argc, const char *argv[]) {
 #endif
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
    } else if (!strcmp(argv[c], "--")) {
      if (c < argc - 1) in_file = argv[++c];
      break;
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Error! Unknown option '%s'\n", argv[c]);
      HelpLong();
@ -889,6 +923,11 @@ int main(int argc, const char *argv[]) {
    } else {
      in_file = argv[c];
    }
    if (parse_error) {
      HelpLong();
      return -1;
    }
  }
  if (in_file == NULL) {
    fprintf(stderr, "No input file specified!\n");
@ -896,6 +935,15 @@ int main(int argc, const char *argv[]) {
    goto Error;
  }
 #if WEBP_ENCODER_ABI_VERSION > 0x0202
  if (use_lossless_preset == 1) {
    if (!WebPConfigLosslessPreset(&config, lossless_preset)) {
      fprintf(stderr, "Invalid lossless preset (-z %d)\n", lossless_preset);
      goto Error;
    }
  }
 #endif
  // Check for unsupported command line options for lossless mode and log
  // warning for such options.
  if (!quiet && config.lossless == 1) {
@ -916,7 +964,7 @@ int main(int argc, const char *argv[]) {
  // Read the input
  if (verbose) {
-    StopwatchReadAndReset(&stop_watch);
+    StopwatchReset(&stop_watch);
  }
  if (!ReadPicture(in_file, &picture, keep_alpha,
                   (keep_metadata == 0) ? NULL : &metadata)) {
@ -924,6 +972,11 @@ int main(int argc, const char *argv[]) {
    goto Error;
  }
  picture.progress_hook = (show_progress && !quiet) ? ProgressReport : NULL;
  if (blend_alpha) {
    WebPBlendAlpha(&picture, background_color);
  }
  if (keep_alpha == 2) {
    WebPCleanupTransparentArea(&picture);
  }
@ -934,8 +987,9 @@ int main(int argc, const char *argv[]) {
  }
  // Open the output
-  if (out_file) {
+  if (out_file != NULL) {
-    out = fopen(out_file, "wb");
+    const int use_stdout = !strcmp(out_file, "-");
    out = use_stdout ? ExUtilSetBinaryMode(stdout) : fopen(out_file, "wb");
    if (out == NULL) {
      fprintf(stderr, "Error! Cannot open output file '%s'\n", out_file);
      goto Error;
@ -965,7 +1019,7 @@ int main(int argc, const char *argv[]) {
  // Compress
  if (verbose) {
-    StopwatchReadAndReset(&stop_watch);
+    StopwatchReset(&stop_watch);
  }
  if (crop != 0) {
    // We use self-cropping using a view.
@ -1006,42 +1060,90 @@ int main(int argc, const char *argv[]) {
    }
  }
-  if (keep_metadata != 0 && out != NULL) {
+  if (keep_metadata != 0) {
    if (out != NULL) {
      if (!WriteWebPWithMetadata(out, &picture, &memory_writer,
                                 &metadata, keep_metadata, &metadata_written)) {
        fprintf(stderr, "Error writing WebP file with metadata!\n");
        goto Error;
      }
    } else {  // output is disabled, just display the metadata stats.
      const struct {
        const MetadataPayload* const payload;
        int flag;
      } *iter, info[] = {
        { &metadata.exif, METADATA_EXIF },
        { &metadata.iccp, METADATA_ICC },
        { &metadata.xmp, METADATA_XMP },
        { NULL, 0 }
      };
      uint32_t unused1 = 0;
      uint64_t unused2 = 0;
      for (iter = info; iter->payload != NULL; ++iter) {
        if (UpdateFlagsAndSize(iter->payload, !!(keep_metadata & iter->flag),
                               0, &unused1, &unused2)) {
          metadata_written |= iter->flag;
        }
      }
    }
  }
  if (!quiet) {
    if (!short_output || print_distortion < 0) {
      if (config.lossless) {
        PrintExtraInfoLossless(&picture, short_output, in_file);
      } else {
        PrintExtraInfoLossy(&picture, short_output, config.low_memory, in_file);
      }
    }
    if (!short_output && picture.extra_info_type > 0) {
      PrintMapInfo(&picture);
    }
    if (print_distortion >= 0) {    // print distortion
      static const char* distortion_names[] = { "PSNR", "SSIM", "LSIM" };
      float values[5];
      // Comparison is performed in YUVA colorspace.
      if (original_picture.use_argb &&
          !WebPPictureARGBToYUVA(&original_picture, WEBP_YUV420A)) {
       fprintf(stderr, "Error while converting original picture to YUVA.\n");
        goto Error;
      }
      if (picture.use_argb &&
          !WebPPictureARGBToYUVA(&picture, WEBP_YUV420A)) {
        fprintf(stderr, "Error while converting compressed picture to YUVA.\n");
        goto Error;
      }
      if (!WebPPictureDistortion(&picture, &original_picture,
                                 print_distortion, values)) {
        fprintf(stderr, "Error while computing the distortion.\n");
        goto Error;
      }
      if (!short_output) {
        fprintf(stderr, "%s: Y:%.2f U:%.2f V:%.2f A:%.2f  Total:%.2f\n",
                distortion_names[print_distortion],
                values[0], values[1], values[2], values[3], values[4]);
      } else {
        fprintf(stderr, "%7d %.4f\n", picture.stats->coded_size, values[4]);
      }
    }
    if (!short_output) {
      PrintMetadataInfo(&metadata, metadata_written);
    }
  }
  if (!quiet && !short_output && print_distortion >= 0) {  // print distortion
    static const char* distortion_names[] = { "PSNR", "SSIM", "LSIM" };
    float values[5];
    WebPPictureDistortion(&picture, &original_picture,
                          print_distortion, values);
    fprintf(stderr, "%s: Y:%.2f U:%.2f V:%.2f A:%.2f  Total:%.2f\n",
            distortion_names[print_distortion],
            values[0], values[1], values[2], values[3], values[4]);
  }
  return_value = 0;
 Error:
 #if WEBP_ENCODER_ABI_VERSION > 0x0203
  WebPMemoryWriterClear(&memory_writer);
 #else
  free(memory_writer.mem);
 #endif
  free(picture.extra_info);
  MetadataFree(&metadata);
  WebPPictureFree(&picture);
  WebPPictureFree(&original_picture);
-  if (out != NULL) {
+  if (out != NULL && out != stdout) {
    fclose(out);
  }
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -1,13 +1,13 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-//  Command-line tool for decoding a WebP image
+//  Command-line tool for decoding a WebP image.
 //
 //  Compile with:     gcc -o dwebp dwebp.c -lwebpdecode
 //
 // Author: Skal (pascal.massimino@gmail.com)
@ -17,11 +17,12 @@
 #include <string.h>
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "webp/config.h"
 #endif
 #ifdef WEBP_HAVE_PNG
 #include <png.h>
 #include <setjmp.h>   // note: this must be included *after* png.h
 #endif
 #ifdef HAVE_WINCODEC_H
@ -32,6 +33,7 @@
 #define COBJMACROS
 #define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
                         // code with COBJMACROS.
 #include <ole2.h>  // CreateStreamOnHGlobal()
 #include <shlwapi.h>
 #include <windows.h>
 #include <wincodec.h>
@ -43,13 +45,13 @@
 static int verbose = 0;
 #ifndef WEBP_DLL
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 extern void* VP8GetCPUInfo;   // opaque forward declaration.
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  // WEBP_DLL
@ -62,6 +64,8 @@ typedef enum {
  PAM,
  PPM,
  PGM,
  BMP,
  TIFF,
  YUV,
  ALPHA_PLANE_ONLY  // this is for experimenting only
 } OutputFileFormat;
@ -82,9 +86,15 @@ typedef enum {
 #define MAKE_REFGUID(x) &(x)
 #endif
-static HRESULT CreateOutputStream(const char* out_file_name, IStream** stream) {
+static HRESULT CreateOutputStream(const char* out_file_name,
                                  int write_to_mem, IStream** stream) {
  HRESULT hr = S_OK;
  if (write_to_mem) {
    // Output to a memory buffer. This is freed when 'stream' is released.
    IFS(CreateStreamOnHGlobal(NULL, TRUE, stream));
  } else {
    IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, stream));
  }
  if (FAILED(hr)) {
    fprintf(stderr, "Error opening output file %s (%08lx)\n",
            out_file_name, hr);
@ -92,8 +102,9 @@ static HRESULT CreateOutputStream(const char* out_file_name, IStream** stream) {
  return hr;
 }
-static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
+static HRESULT WriteUsingWIC(const char* out_file_name, int use_stdout,
-                             unsigned char* rgb, int stride,
+                             REFGUID container_guid,
                             uint8_t* rgb, int stride,
                             uint32_t width, uint32_t height, int has_alpha) {
  HRESULT hr = S_OK;
  IWICImagingFactory* factory = NULL;
@ -114,7 +125,7 @@ static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
            "Windows XP SP3 or newer?). PNG support not available. "
            "Use -ppm or -pgm for available PPM and PGM formats.\n");
  }
-  IFS(CreateOutputStream(out_file_name, &stream));
+  IFS(CreateOutputStream(out_file_name, use_stdout, &stream));
  IFS(IWICImagingFactory_CreateEncoder(factory, container_guid, NULL,
                                       &encoder));
  IFS(IWICBitmapEncoder_Initialize(encoder, stream,
@ -128,6 +139,28 @@ static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
  IFS(IWICBitmapFrameEncode_Commit(frame));
  IFS(IWICBitmapEncoder_Commit(encoder));
  if (SUCCEEDED(hr) && use_stdout) {
    HGLOBAL image;
    IFS(GetHGlobalFromStream(stream, &image));
    if (SUCCEEDED(hr)) {
      HANDLE std_output = GetStdHandle(STD_OUTPUT_HANDLE);
      DWORD mode;
      const BOOL update_mode = GetConsoleMode(std_output, &mode);
      const void* const image_mem = GlobalLock(image);
      DWORD bytes_written = 0;
      // Clear output processing if necessary, then output the image.
      if (update_mode) SetConsoleMode(std_output, 0);
      if (!WriteFile(std_output, image_mem, (DWORD)GlobalSize(image),
                     &bytes_written, NULL) ||
          bytes_written != GlobalSize(image)) {
        hr = E_FAIL;
      }
      if (update_mode) SetConsoleMode(std_output, mode);
      GlobalUnlock(image);
    }
  }
  if (frame != NULL) IUnknown_Release(frame);
  if (encoder != NULL) IUnknown_Release(encoder);
  if (factory != NULL) IUnknown_Release(factory);
@ -135,21 +168,21 @@ static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
  return hr;
 }
-static int WritePNG(const char* out_file_name,
+static int WritePNG(const char* out_file_name, int use_stdout,
                    const WebPDecBuffer* const buffer) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  unsigned char* const rgb = buffer->u.RGBA.rgba;
+  uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const int has_alpha = (buffer->colorspace == MODE_BGRA);
-  return SUCCEEDED(WriteUsingWIC(out_file_name,
+  return SUCCEEDED(WriteUsingWIC(out_file_name, use_stdout,
                                 MAKE_REFGUID(GUID_ContainerFormatPng),
                                 rgb, stride, width, height, has_alpha));
 }
 #elif defined(WEBP_HAVE_PNG)    // !HAVE_WINCODEC_H
-static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
+static void PNGAPI PNGErrorFunction(png_structp png, png_const_charp dummy) {
  (void)dummy;  // remove variable-unused warning
  longjmp(png_jmpbuf(png), 1);
 }
@ -157,25 +190,25 @@ static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
 static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  unsigned char* const rgb = buffer->u.RGBA.rgba;
+  uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const int has_alpha = (buffer->colorspace == MODE_RGBA);
-  png_structp png;
+  volatile png_structp png;
-  png_infop info;
+  volatile png_infop info;
  png_uint_32 y;
  png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
-                                NULL, error_function, NULL);
+                                NULL, PNGErrorFunction, NULL);
  if (png == NULL) {
    return 0;
  }
  info = png_create_info_struct(png);
  if (info == NULL) {
-    png_destroy_write_struct(&png, NULL);
+    png_destroy_write_struct((png_structpp)&png, NULL);
    return 0;
  }
  if (setjmp(png_jmpbuf(png))) {
-    png_destroy_write_struct(&png, &info);
+    png_destroy_write_struct((png_structpp)&png, (png_infopp)&info);
    return 0;
  }
  png_init_io(png, out_file);
@ -189,7 +222,7 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
    png_write_rows(png, &row, 1);
  }
  png_write_end(png, info);
-  png_destroy_write_struct(&png, &info);
+  png_destroy_write_struct((png_structpp)&png, (png_infopp)&info);
  return 1;
 }
 #else    // !HAVE_WINCODEC_H && !WEBP_HAVE_PNG
@ -206,16 +239,16 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
 static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  const unsigned char* const rgb = buffer->u.RGBA.rgba;
+  const uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const size_t bytes_per_px = alpha ? 4 : 3;
  uint32_t y;
  if (alpha) {
-    fprintf(fout, "P7\nWIDTH %d\nHEIGHT %d\nDEPTH 4\nMAXVAL 255\n"
+    fprintf(fout, "P7\nWIDTH %u\nHEIGHT %u\nDEPTH 4\nMAXVAL 255\n"
                  "TUPLTYPE RGB_ALPHA\nENDHDR\n", width, height);
  } else {
-    fprintf(fout, "P6\n%d %d\n255\n", width, height);
+    fprintf(fout, "P6\n%u %u\n255\n", width, height);
  }
  for (y = 0; y < height; ++y) {
    if (fwrite(rgb + y * stride, width, bytes_per_px, fout) != bytes_per_px) {
@ -225,14 +258,154 @@ static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
  return 1;
 }
 // Stores the low 16 bits of 'value' at dst[0..1], least significant byte
 // first. Bits above the low 16 are ignored.
 static void PutLE16(uint8_t* const dst, uint32_t value) {
  const uint32_t low_byte = value & 0xff;
  const uint32_t high_byte = (value >> 8) & 0xff;
  dst[0] = low_byte;
  dst[1] = high_byte;
 }
 // Stores 'value' at dst[0..3] in little-endian order, as two 16-bit halves
 // written with PutLE16() (low half first).
 static void PutLE32(uint8_t* const dst, uint32_t value) {
  const uint32_t low_half = (value >> 0) & 0xffff;
  const uint32_t high_half = (value >> 16) & 0xffff;
  PutLE16(dst + 0, low_half);
  PutLE16(dst + 2, high_half);
 }
 #define BMP_HEADER_SIZE 54
 // Writes 'buffer' to 'fout' as an uncompressed (BI_RGB) BMP file: a
 // BMP_HEADER_SIZE-byte header followed by the pixel rows, each padded with
 // zero bytes to a multiple of four. The height is stored negated so that rows
 // are written in the order they appear in the buffer.
 // Returns 1 on success, 0 as soon as an fwrite() call fails.
 static int WriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
  const uint8_t* const pixels = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
  // Every colorspace except MODE_BGR is written with four bytes per pixel.
  const uint32_t bytes_per_px = (buffer->colorspace == MODE_BGR) ? 3 : 4;
  const uint32_t row_bytes = bytes_per_px * width;
  const uint32_t padded_row_bytes = (row_bytes + 3) & ~3;   // multiple of 4
  const uint32_t pad_bytes = padded_row_bytes - row_bytes;  // 0..3
  const uint32_t file_size = padded_row_bytes * height + BMP_HEADER_SIZE;
  const uint8_t padding[3] = { 0 };
  uint8_t header[BMP_HEADER_SIZE] = { 0 };
  uint32_t row = 0;

  // bitmap file header
  PutLE16(header + 0, 0x4d42);                // signature 'BM'
  PutLE32(header + 2, file_size);             // size including header
  PutLE32(header + 6, 0);                     // reserved
  PutLE32(header + 10, BMP_HEADER_SIZE);      // offset to pixel array
  // bitmap info header
  PutLE32(header + 14, 40);                   // DIB header size
  PutLE32(header + 18, width);                // dimensions
  PutLE32(header + 22, -(int)height);         // vertical flip!
  PutLE16(header + 26, 1);                    // number of planes
  PutLE16(header + 28, bytes_per_px * 8);     // bits per pixel
  PutLE32(header + 30, 0);                    // no compression (BI_RGB)
  PutLE32(header + 34, 0);                    // image size (dummy)
  PutLE32(header + 38, 2400);                 // x pixels/meter
  PutLE32(header + 42, 2400);                 // y pixels/meter
  PutLE32(header + 46, 0);                    // number of palette colors
  PutLE32(header + 50, 0);                    // important color count
  // TODO(skal): color profile

  if (fwrite(header, sizeof(header), 1, fout) != 1) {
    return 0;
  }

  // Pixel array: one source row, then its zero padding (if any).
  while (row < height) {
    if (fwrite(pixels + row * stride, row_bytes, 1, fout) != 1) {
      return 0;
    }
    if (pad_bytes != 0 && fwrite(padding, pad_bytes, 1, fout) != 1) {
      return 0;
    }
    ++row;
  }
  return 1;
 }
 #undef BMP_HEADER_SIZE
 #define NUM_IFD_ENTRIES 15
 #define EXTRA_DATA_SIZE 16
 // 10b for signature/header + n * 12b entries + 4b for IFD terminator:
 #define EXTRA_DATA_OFFSET (10 + 12 * NUM_IFD_ENTRIES + 4)
 #define TIFF_HEADER_SIZE (EXTRA_DATA_OFFSET + EXTRA_DATA_SIZE)
 // Writes 'buffer' to 'fout' as an uncompressed little-endian TIFF made of a
 // fixed TIFF_HEADER_SIZE-byte header (signature, one IFD, extra data) followed
 // directly by the pixel rows as a single strip.
 // Any colorspace other than MODE_RGB is written with 4 bytes per pixel and an
 // ExtraSamples entry; MODE_RGB is written with 3 bytes per pixel.
 // Returns 1 on success, 0 as soon as an fwrite() call fails.
 static int WriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
  const int has_alpha = (buffer->colorspace != MODE_RGB);
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
  const uint8_t* const rgba = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const uint8_t bytes_per_px = has_alpha ? 4 : 3;
  // For non-alpha case, we omit tag 0x152 (ExtraSamples).
  const uint8_t num_ifd_entries = has_alpha ? NUM_IFD_ENTRIES
                                            : NUM_IFD_ENTRIES - 1;
  // The number in each trailing comment below is the byte offset of that
  // 12-byte IFD entry inside tiff_header; entries marked (TBD) are patched
  // after the initializer.
  uint8_t tiff_header[TIFF_HEADER_SIZE] = {
    0x49, 0x49, 0x2a, 0x00,   // little endian signature
    8, 0, 0, 0,               // offset to the unique IFD that follows
    // IFD (offset = 8). Entries must be written in increasing tag order.
    num_ifd_entries, 0,       // Number of entries in the IFD (12 bytes each).
    0x00, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    //  10: Width  (TBD)
    0x01, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    //  22: Height (TBD)
    0x02, 0x01, 3, 0, bytes_per_px, 0, 0, 0,     //  34: BitsPerSample: 8888
        EXTRA_DATA_OFFSET + 0, 0, 0, 0,
    0x03, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    //  46: Compression: none
    0x06, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    //  58: Photometric: RGB
    0x11, 0x01, 4, 0, 1, 0, 0, 0,                //  70: Strips offset:
        TIFF_HEADER_SIZE, 0, 0, 0,               //      data follows header
    0x12, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    //  82: Orientation: topleft
    0x15, 0x01, 3, 0, 1, 0, 0, 0,                //  94: SamplesPerPixels
        bytes_per_px, 0, 0, 0,
    0x16, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    // 106: Rows per strip (TBD)
    0x17, 0x01, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0,    // 118: StripByteCount (TBD)
    0x1a, 0x01, 5, 0, 1, 0, 0, 0,                // 130: X-resolution
        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
    0x1b, 0x01, 5, 0, 1, 0, 0, 0,                // 142: Y-resolution
        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
    0x1c, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 154: PlanarConfiguration
    0x28, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    // 166: ResolutionUnit (inch)
    0x52, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 178: ExtraSamples: rgbA
    0, 0, 0, 0,                                  // 190: IFD terminator
    // EXTRA_DATA_OFFSET:
    8, 0, 8, 0, 8, 0, 8, 0,      // BitsPerSample
    72, 0, 0, 0, 1, 0, 0, 0      // 72 pixels/inch, for X/Y-resolution
  };
  uint32_t y;
  // Fill placeholders in IFD:
  // ('+ 8' skips the first eight bytes of an entry to reach its last four
  // bytes, where the value is stored.)
  PutLE32(tiff_header + 10 + 8, width);
  PutLE32(tiff_header + 22 + 8, height);
  // Rows per strip == height and StripByteCount == whole image: one strip.
  PutLE32(tiff_header + 106 + 8, height);
  PutLE32(tiff_header + 118 + 8, width * bytes_per_px * height);
  // Without alpha, the ExtraSamples entry at offset 178 is not counted in
  // num_ifd_entries, so its first four bytes are zeroed to act as the
  // terminator in its place.
  if (!has_alpha) PutLE32(tiff_header + 178, 0);  // IFD terminator
  // write header
  if (fwrite(tiff_header, sizeof(tiff_header), 1, fout) != 1) {
    return 0;
  }
  // write pixel values
  // Rows are 'stride' bytes apart in the buffer; only width * bytes_per_px
  // bytes of each row are written.
  for (y = 0; y < height; ++y) {
    if (fwrite(rgba + y * stride, bytes_per_px, width, fout) != width) {
      return 0;
    }
  }
  return 1;
 }
 #undef TIFF_HEADER_SIZE
 #undef EXTRA_DATA_OFFSET
 #undef EXTRA_DATA_SIZE
 #undef NUM_IFD_ENTRIES
 static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  const unsigned char* const a = buffer->u.YUVA.a;
+  const uint8_t* const a = buffer->u.YUVA.a;
  const int a_stride = buffer->u.YUVA.a_stride;
  uint32_t y;
  assert(a != NULL);
-  fprintf(fout, "P5\n%d %d\n255\n", width, height);
+  fprintf(fout, "P5\n%u %u\n255\n", width, height);
  for (y = 0; y < height; ++y) {
    if (fwrite(a + y * a_stride, width, 1, fout) != 1) {
      return 0;
@ -289,30 +462,33 @@ static int WritePGMOrYUV(FILE* fout, const WebPDecBuffer* const buffer,
  return ok;
 }
-static void SaveOutput(const WebPDecBuffer* const buffer,
+static int SaveOutput(const WebPDecBuffer* const buffer,
                      OutputFileFormat format, const char* const out_file) {
  FILE* fout = NULL;
  int needs_open_file = 1;
  const int use_stdout = !strcmp(out_file, "-");
  int ok = 1;
  Stopwatch stop_watch;
-  if (verbose)
+  if (verbose) {
-    StopwatchReadAndReset(&stop_watch);
+    StopwatchReset(&stop_watch);
  }
 #ifdef HAVE_WINCODEC_H
  needs_open_file = (format != PNG);
 #endif
  if (needs_open_file) {
-    fout = fopen(out_file, "wb");
+    fout = use_stdout ? ExUtilSetBinaryMode(stdout) : fopen(out_file, "wb");
-    if (!fout) {
+    if (fout == NULL) {
      fprintf(stderr, "Error opening output file %s\n", out_file);
-      return;
+      return 0;
    }
  }
  if (format == PNG) {
 #ifdef HAVE_WINCODEC_H
-    ok &= WritePNG(out_file, buffer);
+    ok &= WritePNG(out_file, use_stdout, buffer);
 #else
    ok &= WritePNG(fout, buffer);
 #endif
@ -320,24 +496,37 @@ static void SaveOutput(const WebPDecBuffer* const buffer,
    ok &= WritePPM(fout, buffer, 1);
  } else if (format == PPM) {
    ok &= WritePPM(fout, buffer, 0);
  } else if (format == BMP) {
    ok &= WriteBMP(fout, buffer);
  } else if (format == TIFF) {
    ok &= WriteTIFF(fout, buffer);
  } else if (format == PGM || format == YUV) {
    ok &= WritePGMOrYUV(fout, buffer, format);
  } else if (format == ALPHA_PLANE_ONLY) {
    ok &= WriteAlphaPlane(fout, buffer);
  }
-  if (fout) {
+  if (fout != NULL && fout != stdout) {
    fclose(fout);
  }
  if (ok) {
-    printf("Saved file %s\n", out_file);
+    if (use_stdout) {
      fprintf(stderr, "Saved to stdout\n");
    } else {
      fprintf(stderr, "Saved file %s\n", out_file);
    }
    if (verbose) {
      const double write_time = StopwatchReadAndReset(&stop_watch);
-      printf("Time to write output: %.3fs\n", write_time);
+      fprintf(stderr, "Time to write output: %.3fs\n", write_time);
    }
  } else {
    if (use_stdout) {
      fprintf(stderr, "Error writing to stdout !!\n");
    } else {
      fprintf(stderr, "Error writing file %s !!\n", out_file);
    }
  }
  return ok;
 }
 static void Help(void) {
  printf("Usage: dwebp in_file [options] [-o out_file]\n\n"
@ -345,32 +534,43 @@ static void Help(void) {
         "Use following options to convert into alternate image formats:\n"
         "  -pam ......... save the raw RGBA samples as a color PAM\n"
         "  -ppm ......... save the raw RGB samples as a color PPM\n"
         "  -bmp ......... save as uncompressed BMP format\n"
         "  -tiff ........ save as uncompressed TIFF format\n"
         "  -pgm ......... save the raw YUV samples as a grayscale PGM\n"
-         "                 file with IMC4 layout.\n"
+         "                 file with IMC4 layout\n"
-         "  -yuv ......... save the raw YUV samples in flat layout.\n"
+         "  -yuv ......... save the raw YUV samples in flat layout\n"
         "\n"
         " Other options are:\n"
-         "  -version  .... print version number and exit.\n"
+         "  -version  .... print version number and exit\n"
-         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
+         "  -nofancy ..... don't use the fancy YUV420 upscaler\n"
-         "  -nofilter .... disable in-loop filtering.\n"
+         "  -nofilter .... disable in-loop filtering\n"
         "  -nodither .... disable dithering\n"
         "  -dither <d> .. dithering strength (in 0..100)\n"
 #if WEBP_DECODER_ABI_VERSION > 0x0204
         "  -alpha_dither  use alpha-plane dithering if needed\n"
 #endif
         "  -mt .......... use multi-threading\n"
         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
-         "  -alpha ....... only save the alpha plane.\n"
+#if WEBP_DECODER_ABI_VERSION > 0x0203
-         "  -h     ....... this help message.\n"
+         "  -flip ........ flip the output vertically\n"
 #endif
         "  -alpha ....... only save the alpha plane\n"
         "  -incremental . use incremental decoding (useful for tests)\n"
         "  -h     ....... this help message\n"
         "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
 #ifndef WEBP_DLL
-         "  -noasm ....... disable all assembly optimizations.\n"
+         "  -noasm ....... disable all assembly optimizations\n"
 #endif
        );
 }
-static const char* const kStatusMessages[] = {
+static const char* const kFormatType[] = {
-  "OK", "OUT_OF_MEMORY", "INVALID_PARAM", "BITSTREAM_ERROR",
+  "unspecified", "lossy", "lossless"
  "UNSUPPORTED_FEATURE", "SUSPENDED", "USER_ABORT", "NOT_ENOUGH_DATA"
 };
 int main(int argc, const char *argv[]) {
  int ok = 0;
  const char *in_file = NULL;
  const char *out_file = NULL;
@ -378,6 +578,7 @@ int main(int argc, const char *argv[]) {
  WebPDecBuffer* const output_buffer = &config.output;
  WebPBitstreamFeatures* const bitstream = &config.input;
  OutputFileFormat format = PNG;
  int incremental = 0;
  int c;
  if (!WebPInitDecoderConfig(&config)) {
@ -386,6 +587,7 @@ int main(int argc, const char *argv[]) {
  }
  for (c = 1; c < argc; ++c) {
    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
      return 0;
@ -401,6 +603,10 @@ int main(int argc, const char *argv[]) {
      format = PAM;
    } else if (!strcmp(argv[c], "-ppm")) {
      format = PPM;
    } else if (!strcmp(argv[c], "-bmp")) {
      format = BMP;
    } else if (!strcmp(argv[c], "-tiff")) {
      format = TIFF;
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetDecoderVersion();
      printf("%d.%d.%d\n",
@ -412,22 +618,40 @@ int main(int argc, const char *argv[]) {
      format = YUV;
    } else if (!strcmp(argv[c], "-mt")) {
      config.options.use_threads = 1;
 #if WEBP_DECODER_ABI_VERSION > 0x0204
    } else if (!strcmp(argv[c], "-alpha_dither")) {
      config.options.alpha_dithering_strength = 100;
 #endif
    } else if (!strcmp(argv[c], "-nodither")) {
      config.options.dithering_strength = 0;
    } else if (!strcmp(argv[c], "-dither") && c < argc - 1) {
      config.options.dithering_strength =
          ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
      config.options.use_cropping = 1;
-      config.options.crop_left   = strtol(argv[++c], NULL, 0);
+      config.options.crop_left   = ExUtilGetInt(argv[++c], 0, &parse_error);
-      config.options.crop_top    = strtol(argv[++c], NULL, 0);
+      config.options.crop_top    = ExUtilGetInt(argv[++c], 0, &parse_error);
-      config.options.crop_width  = strtol(argv[++c], NULL, 0);
+      config.options.crop_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
-      config.options.crop_height = strtol(argv[++c], NULL, 0);
+      config.options.crop_height = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-scale") && c < argc - 2) {
      config.options.use_scaling = 1;
-      config.options.scaled_width  = strtol(argv[++c], NULL, 0);
+      config.options.scaled_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
-      config.options.scaled_height = strtol(argv[++c], NULL, 0);
+      config.options.scaled_height = ExUtilGetInt(argv[++c], 0, &parse_error);
 #if WEBP_DECODER_ABI_VERSION > 0x0203
    } else if (!strcmp(argv[c], "-flip")) {
      config.options.flip = 1;
 #endif
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
 #ifndef WEBP_DLL
    } else if (!strcmp(argv[c], "-noasm")) {
      VP8GetCPUInfo = NULL;
 #endif
    } else if (!strcmp(argv[c], "-incremental")) {
      incremental = 1;
    } else if (!strcmp(argv[c], "--")) {
      if (c < argc - 1) in_file = argv[++c];
      break;
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Unknown option '%s'\n", argv[c]);
      Help();
@ -435,6 +659,11 @@ int main(int argc, const char *argv[]) {
    } else {
      in_file = argv[c];
    }
    if (parse_error) {
      Help();
      return -1;
    }
  }
  if (in_file == NULL) {
@ -444,27 +673,11 @@ int main(int argc, const char *argv[]) {
  }
  {
    Stopwatch stop_watch;
    VP8StatusCode status = VP8_STATUS_OK;
    int ok;
    size_t data_size = 0;
    const uint8_t* data = NULL;
-
+    if (!ExUtilLoadWebP(in_file, &data, &data_size, bitstream)) {
-    if (!ExUtilReadFile(in_file, &data, &data_size)) return -1;
+      return -1;
    if (verbose)
      StopwatchReadAndReset(&stop_watch);
    status = WebPGetFeatures(data, data_size, bitstream);
    if (status != VP8_STATUS_OK) {
      goto end;
    }
    if (bitstream->has_animation) {
      fprintf(stderr,
              "Error! Decoding of an animated WebP file is not supported.\n"
              "       Use webpmux to extract the individual frames or\n"
              "       vwebp to view this image.\n");
    }
    switch (format) {
@ -481,6 +694,13 @@ int main(int argc, const char *argv[]) {
      case PPM:
        output_buffer->colorspace = MODE_RGB;  // drops alpha for PPM
        break;
      case BMP:
        output_buffer->colorspace = bitstream->has_alpha ? MODE_BGRA : MODE_BGR;
        break;
      case TIFF:    // note: force pre-multiplied alpha
        output_buffer->colorspace =
            bitstream->has_alpha ? MODE_rgbA : MODE_RGB;
        break;
      case PGM:
      case YUV:
        output_buffer->colorspace = bitstream->has_alpha ? MODE_YUVA : MODE_YUV;
@ -492,36 +712,40 @@ int main(int argc, const char *argv[]) {
        free((void*)data);
        return -1;
    }
    status = WebPDecode(data, data_size, &config);
-    if (verbose) {
+    if (incremental) {
-      const double decode_time = StopwatchReadAndReset(&stop_watch);
+      status = ExUtilDecodeWebPIncremental(data, data_size, verbose, &config);
-      printf("Time to decode picture: %.3fs\n", decode_time);
+    } else {
      status = ExUtilDecodeWebP(data, data_size, verbose, &config);
    }
- end:
+
    free((void*)data);
    ok = (status == VP8_STATUS_OK);
    if (!ok) {
-      fprintf(stderr, "Decoding of %s failed.\n", in_file);
+      ExUtilPrintWebPError(in_file, status);
-      fprintf(stderr, "Status: %d (%s)\n", status, kStatusMessages[status]);
+      goto Exit;
      return -1;
    }
  }
-  if (out_file) {
+  if (out_file != NULL) {
-    printf("Decoded %s. Dimensions: %d x %d%s. Now saving...\n", in_file,
+    fprintf(stderr, "Decoded %s. Dimensions: %d x %d %s. Format: %s. "
-           output_buffer->width, output_buffer->height,
+                    "Now saving...\n",
           bitstream->has_alpha ? " (with alpha)" : "");
    SaveOutput(output_buffer, format, out_file);
  } else {
    printf("File %s can be decoded (dimensions: %d x %d)%s.\n",
            in_file, output_buffer->width, output_buffer->height,
-           bitstream->has_alpha ? " (with alpha)" : "");
+            bitstream->has_alpha ? " (with alpha)" : "",
-    printf("Nothing written; use -o flag to save the result as e.g. PNG.\n");
+            kFormatType[bitstream->format]);
    ok = SaveOutput(output_buffer, format, out_file);
  } else {
    fprintf(stderr, "File %s can be decoded "
                    "(dimensions: %d x %d %s. Format: %s).\n",
            in_file, output_buffer->width, output_buffer->height,
            bitstream->has_alpha ? " (with alpha)" : "",
            kFormatType[bitstream->format]);
    fprintf(stderr, "Nothing written; "
                    "use -o flag to save the result as e.g. PNG.\n");
  }
 Exit:
  WebPFreeDecBuffer(output_buffer);
-
+  return ok ? 0 : -1;
  return 0;
 }
 //------------------------------------------------------------------------------
--- a/examples/example_util.c
+++ b/examples/example_util.c
@ -1,32 +1,114 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Utility functions used by the example programs.
 //
 #include "./example_util.h"
 #if defined(_WIN32)
 #include <fcntl.h>   // for _O_BINARY
 #include <io.h>      // for _setmode()
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#if defined(__cplusplus) || defined(c_plusplus)
+#include "webp/decode.h"
-extern "C" {
+#include "./stopwatch.h"
-#endif
+
 //------------------------------------------------------------------------------
 // String parsing
 // Parses 'v' with strtoul() in the given 'base' and returns the result
 // narrowed to 32 bits. If no characters could be converted (or 'v' is NULL),
 // '*error' is set to 1 and a message is printed, but only when 'error' is
 // non-NULL and not already set. Trailing characters after a valid prefix are
 // not treated as an error.
 uint32_t ExUtilGetUInt(const char* const v, int base, int* const error) {
  char* stop = NULL;
  uint32_t parsed = 0u;
  if (v != NULL) {
    parsed = (uint32_t)strtoul(v, &stop, base);
  }
  // 'stop == v' covers both "nothing consumed" and the NULL-input case, where
  // both pointers are NULL.
  if (stop != v) return parsed;
  if (error == NULL || *error) return parsed;
  *error = 1;
  fprintf(stderr, "Error! '%s' is not an integer.\n",
          (v != NULL) ? v : "(null)");
  return parsed;
 }
 // Signed counterpart of ExUtilGetUInt(): parses through ExUtilGetUInt() and
 // converts the 32-bit result to int. Error reporting is the same.
 int ExUtilGetInt(const char* const v, int base, int* const error) {
  const uint32_t raw = ExUtilGetUInt(v, base, error);
  return (int)raw;
 }
 // Parses 'v' with strtod() and returns the result as a float. If no
 // characters could be converted (or 'v' is NULL), '*error' is set to 1 and a
 // message is printed, but only when 'error' is non-NULL and not already set.
 float ExUtilGetFloat(const char* const v, int* const error) {
  char* stop = NULL;
  float parsed = 0.f;
  int nothing_parsed;
  if (v != NULL) {
    parsed = (float)strtod(v, &stop);
  }
  nothing_parsed = (stop == v);   // also true for NULL input
  if (nothing_parsed && error != NULL) {
    if (!*error) {
      *error = 1;
      fprintf(stderr, "Error! '%s' is not a floating point number.\n",
              (v != NULL) ? v : "(null)");
    }
  }
  return parsed;
 }
 // -----------------------------------------------------------------------------
 // File I/O
 // Switches 'file' to binary mode where that is needed (_WIN32 builds, via
 // _setmode()); on other builds this is a no-op.
 // Returns 'file' on success, NULL if the mode change failed.
 FILE* ExUtilSetBinaryMode(FILE* file) {
 #if defined(_WIN32)
  const int previous_mode = _setmode(_fileno(file), _O_BINARY);
  if (previous_mode == -1) {
    fprintf(stderr, "Failed to reopen file in O_BINARY mode.\n");
    return NULL;
  }
 #endif
  return file;
 }
 // Reads stdin until EOF into a heap buffer that grows by doubling (starting
 // at 16384 bytes). On success stores the buffer and byte count in '*data' and
 // '*data_size' and returns 1; the caller releases '*data' with free().
 // Returns 0 on NULL arguments, if stdin cannot be put in binary mode, or on
 // allocation/read failure (the partial buffer is freed and '*data' stays
 // NULL).
 int ExUtilReadFromStdin(const uint8_t** data, size_t* data_size) {
  static const size_t kBlockSize = 16384;  // default initial size
  uint8_t* contents = NULL;
  size_t capacity = 0;
  size_t used = 0;
  int failed = 0;

  if (data == NULL || data_size == NULL) return 0;
  *data = NULL;
  *data_size = 0;

  if (!ExUtilSetBinaryMode(stdin)) return 0;

  while (!failed && !feof(stdin)) {
    // Grow by the current capacity (i.e. double it), then try to fill the
    // newly added space completely.
    const size_t growth = (capacity == 0) ? kBlockSize : capacity;
    void* const grown = realloc(contents, capacity + growth);
    if (grown == NULL) {
      failed = 1;
    } else {
      contents = (uint8_t*)grown;
      capacity += growth;
      used += fread(contents + used, 1, growth, stdin);
      if (used < capacity) break;   // short read: EOF or error
    }
  }

  if (failed || ferror(stdin)) {
    free(contents);
    fprintf(stderr, "Could not read from stdin\n");
    return 0;
  }

  *data = contents;
  *data_size = used;
  return 1;
 }
 int ExUtilReadFile(const char* const file_name,
                   const uint8_t** data, size_t* data_size) {
  int ok;
  void* file_data;
  size_t file_size;
  FILE* in;
  const int from_stdin = (file_name == NULL) || !strcmp(file_name, "-");
-  if (file_name == NULL || data == NULL || data_size == NULL) return 0;
+  if (from_stdin) return ExUtilReadFromStdin(data, data_size);
  if (data == NULL || data_size == NULL) return 0;
  *data = NULL;
  *data_size = 0;
@ -58,20 +140,119 @@ int ExUtilWriteFile(const char* const file_name,
                    const uint8_t* data, size_t data_size) {
  int ok;
  FILE* out;
  const int to_stdout = (file_name == NULL) || !strcmp(file_name, "-");
-  if (file_name == NULL || data == NULL) {
+  if (data == NULL) {
    return 0;
  }
-  out = fopen(file_name, "wb");
+  out = to_stdout ? stdout : fopen(file_name, "wb");
  if (out == NULL) {
    fprintf(stderr, "Error! Cannot open output file '%s'\n", file_name);
    return 0;
  }
  ok = (fwrite(data, data_size, 1, out) == 1);
-  fclose(out);
+  if (out != stdout) fclose(out);
  return ok;
 }
-#if defined(__cplusplus) || defined(c_plusplus)
+//------------------------------------------------------------------------------
-}    // extern "C"
+// WebP decoding
-#endif
+
 // Text for each decoder status value; ExUtilPrintWebPError() indexes this
 // table directly with the status code.
 static const char* const kStatusMessages[VP8_STATUS_NOT_ENOUGH_DATA + 1] = {
  "OK",
  "OUT_OF_MEMORY",
  "INVALID_PARAM",
  "BITSTREAM_ERROR",
  "UNSUPPORTED_FEATURE",
  "SUSPENDED",
  "USER_ABORT",
  "NOT_ENOUGH_DATA"
 };
 // Prints a notice on stderr when 'config->input' is flagged as an animation;
 // does nothing otherwise.
 static void PrintAnimationWarning(const WebPDecoderConfig* const config) {
  if (!config->input.has_animation) return;
  fprintf(stderr,
          "Error! Decoding of an animated WebP file is not supported.\n"
          "       Use webpmux to extract the individual frames or\n"
          "       vwebp to view this image.\n");
 }
 // Reports on stderr that decoding 'in_file' failed, followed by the numeric
 // 'status'. The textual name from kStatusMessages is appended only when
 // 'status' lies within [VP8_STATUS_OK, VP8_STATUS_NOT_ENOUGH_DATA].
 void ExUtilPrintWebPError(const char* const in_file, int status) {
  const int has_name =
      !(status < VP8_STATUS_OK || status > VP8_STATUS_NOT_ENOUGH_DATA);
  fprintf(stderr, "Decoding of %s failed.\n", in_file);
  fprintf(stderr, "Status: %d", status);
  if (has_name) {
    fprintf(stderr, "(%s)", kStatusMessages[status]);
  }
  fprintf(stderr, "\n");
 }
 // Loads 'in_file' through ExUtilReadFile() and checks it with
 // WebPGetFeatures(). The features are stored in 'bitstream' when it is
 // non-NULL, otherwise in a local that is discarded.
 // Returns 1 on success. Returns 0 if the read fails, or if the features
 // cannot be parsed; in the latter case the data is freed, '*data' and
 // '*data_size' are cleared and the error is printed.
 int ExUtilLoadWebP(const char* const in_file,
                   const uint8_t** data, size_t* data_size,
                   WebPBitstreamFeatures* bitstream) {
  WebPBitstreamFeatures scratch_features;
  WebPBitstreamFeatures* const features =
      (bitstream != NULL) ? bitstream : &scratch_features;
  VP8StatusCode status;

  if (!ExUtilReadFile(in_file, data, data_size)) return 0;

  status = WebPGetFeatures(*data, *data_size, features);
  if (status == VP8_STATUS_OK) return 1;

  // Not a usable WebP: hand nothing back to the caller.
  free((void*)*data);
  *data = NULL;
  *data_size = 0;
  ExUtilPrintWebPError(in_file, status);
  return 0;
 }
 //------------------------------------------------------------------------------
 // Decodes 'data' with a single WebPDecode() call using 'config'. Prints the
 // animation notice first if applicable and, when 'verbose' is set, the
 // decode time on stderr.
 // Returns VP8_STATUS_INVALID_PARAM if 'config' is NULL, otherwise the status
 // returned by WebPDecode().
 VP8StatusCode ExUtilDecodeWebP(const uint8_t* const data, size_t data_size,
                               int verbose, WebPDecoderConfig* const config) {
  Stopwatch timer;
  VP8StatusCode result;

  if (config == NULL) return VP8_STATUS_INVALID_PARAM;

  PrintAnimationWarning(config);
  StopwatchReset(&timer);

  result = WebPDecode(data, data_size, config);

  if (verbose) {
    const double elapsed = StopwatchReadAndReset(&timer);
    fprintf(stderr, "Time to decode picture: %.3fs\n", elapsed);
  }
  return result;
 }
 // Same job as ExUtilDecodeWebP(), but goes through the incremental decoder:
 // creates it with WebPIDecode(), feeds the whole of 'data' in one
 // WebPIUpdate() call, then deletes it.
 // Returns VP8_STATUS_INVALID_PARAM if 'config' is NULL,
 // VP8_STATUS_OUT_OF_MEMORY if the decoder could not be created (no timing is
 // printed in that case), otherwise the status returned by WebPIUpdate().
 VP8StatusCode ExUtilDecodeWebPIncremental(
    const uint8_t* const data, size_t data_size,
    int verbose, WebPDecoderConfig* const config) {
  Stopwatch timer;
  VP8StatusCode result;
  WebPIDecoder* idec;

  if (config == NULL) return VP8_STATUS_INVALID_PARAM;

  PrintAnimationWarning(config);
  StopwatchReset(&timer);

  idec = WebPIDecode(data, data_size, config);
  if (idec == NULL) {
    fprintf(stderr, "Failed during WebPINewDecoder().\n");
    return VP8_STATUS_OUT_OF_MEMORY;
  }
  result = WebPIUpdate(idec, data, data_size);
  WebPIDelete(idec);

  if (verbose) {
    const double elapsed = StopwatchReadAndReset(&timer);
    fprintf(stderr, "Time to decode picture: %.3fs\n", elapsed);
  }
  return result;
 }
 // -----------------------------------------------------------------------------
--- a/examples/example_util.h
+++ b/examples/example_util.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Utility functions used by the example programs.
@ -11,23 +13,76 @@
 #ifndef WEBP_EXAMPLES_EXAMPLE_UTIL_H_
 #define WEBP_EXAMPLES_EXAMPLE_UTIL_H_
-#include "webp/types.h"
+#include <stdio.h>
 #include "webp/decode.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // String parsing
 // Parses 'v' using strto(ul|l|d)(). If error is non-NULL, '*error' is set to
 // true on failure while on success it is left unmodified to allow chaining of
 // calls. An error is only printed on the first occurrence.
 uint32_t ExUtilGetUInt(const char* const v, int base, int* const error);
 int ExUtilGetInt(const char* const v, int base, int* const error);
 float ExUtilGetFloat(const char* const v, int* const error);
 //------------------------------------------------------------------------------
 // File I/O
 // Reopen file in binary (O_BINARY) mode.
 // Returns 'file' on success, NULL otherwise.
 FILE* ExUtilSetBinaryMode(FILE* file);
 // Allocates storage for entire file 'file_name' and returns contents and size
 // in 'data' and 'data_size'. Returns 1 on success, 0 otherwise. '*data' should
 // be deleted using free().
 // If 'file_name' is NULL or equal to "-", input is read from stdin by calling
 // the function ExUtilReadFromStdin().
 int ExUtilReadFile(const char* const file_name,
                   const uint8_t** data, size_t* data_size);
 // Same as ExUtilReadFile(), but reads until EOF from stdin instead.
 int ExUtilReadFromStdin(const uint8_t** data, size_t* data_size);
 // Write a data segment into a file named 'file_name'. Returns true if ok.
 // If 'file_name' is NULL or equal to "-", output is written to stdout.
 int ExUtilWriteFile(const char* const file_name,
                    const uint8_t* data, size_t data_size);
-#if defined(__cplusplus) || defined(c_plusplus)
+//------------------------------------------------------------------------------
 // WebP decoding
 // Prints an informative error message regarding decode failure of 'in_file'.
 // 'status' is treated as a VP8StatusCode and if valid will be printed as a
 // text string.
 void ExUtilPrintWebPError(const char* const in_file, int status);
 // Reads a WebP from 'in_file', returning the contents and size in 'data' and
 // 'data_size'. If not NULL, 'bitstream' is populated using WebPGetFeatures().
 // Returns true on success.
 int ExUtilLoadWebP(const char* const in_file,
                   const uint8_t** data, size_t* data_size,
                   WebPBitstreamFeatures* bitstream);
 // Decodes the WebP contained in 'data'.
 // 'config' is a structure previously initialized by WebPInitDecoderConfig().
 // 'config->output' should have the desired colorspace selected. 'verbose' will
 // cause decode timing to be reported.
 // Returns the decoder status. On success 'config->output' will contain the
 // decoded picture.
 VP8StatusCode ExUtilDecodeWebP(const uint8_t* const data, size_t data_size,
                               int verbose, WebPDecoderConfig* const config);
 // Same as ExUtilDecodeWebP(), but using the incremental decoder.
 VP8StatusCode ExUtilDecodeWebPIncremental(
    const uint8_t* const data, size_t data_size,
    int verbose, WebPDecoderConfig* const config);
 #ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -1,54 +1,96 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  simple tool to convert animated GIFs to WebP
 //
 // Getting the prerequisites:
 // Debian-like linux:
 //   sudo apt-get install libgif-dev
 // MacPorts
 //   sudo port install giflib
 //
 // Compiling:
 //   gcc -o gif2webp gif2webp.c -O3 -lwebpmux -lwebp -lgif -lpthread -lm
 //
 // Authors: Skal (pascal.massimino@gmail.com)
 //          Urvang (urvang@google.com)
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "webp/config.h"
 #endif
 #ifdef WEBP_HAVE_GIF
 #include <gif_lib.h>
 #include "webp/encode.h"
 #include "webp/mux.h"
 #include "./example_util.h"
 #include "./gif2webp_util.h"
 // GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
 // LOCAL_GIF_VERSION packs the giflib version as (major << 8) | minor, and
 // LOCAL_GIF_PREREQ(maj, min) is true when that version is at least maj.min.
 #if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR)
 # define LOCAL_GIF_VERSION ((GIFLIB_MAJOR << 8) | GIFLIB_MINOR)
 # define LOCAL_GIF_PREREQ(maj, min) \
    (LOCAL_GIF_VERSION >= (((maj) << 8) | (min)))
 #else
 // Version macros unavailable: every LOCAL_GIF_PREREQ() test evaluates to 0.
 # define LOCAL_GIF_VERSION 0
 # define LOCAL_GIF_PREREQ(maj, min) 0
 #endif
 // Bit tested in an extension's 'flags' byte to decide whether the frame
 // declares a transparent color index.
 #define GIF_TRANSPARENT_MASK 0x01
 // The disposal code is read as (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK.
 #define GIF_DISPOSE_MASK     0x07
 #define GIF_DISPOSE_SHIFT    2
 // Pixels are packed as 0xAARRGGBB (see Remap()): alpha 0 with white RGB.
 #define TRANSPARENT_COLOR    0x00ffffff
 // Fully opaque white; used as the default and fallback background color.
 #define WHITE_COLOR          0xffffffff
 // Upper bound enforced on kmax - kmin by SanitizeKeyFrameIntervals().
 #define MAX_CACHE_SIZE       30
 //------------------------------------------------------------------------------
-static int transparent_index = -1;  // No transparency by default.
+static int transparent_index = -1;  // Opaque frame by default.
 // Adjusts the requested key-frame distances '*kmin_ptr' / '*kmax_ptr' in place:
 //  - kmin == 0 disables key-frame insertion: kmax becomes the largest size_t
 //    value and kmin one less than that.
 //  - kmax == 0 is replaced by the largest size_t value.
 //  - if kmin >= kmax, kmin is lowered to kmax - 1; otherwise, if kmin is below
 //    kmax / 2 + 1, it is raised to that value.
 //  - finally kmin is raised so that kmax - kmin does not exceed MAX_CACHE_SIZE.
 // Each adjustment is reported on stderr, unless one of the two '0' cases above
 // was taken (those silence all warnings).
-static void ClearPicture(WebPPicture* const picture, uint32_t color) {
+static void SanitizeKeyFrameIntervals(size_t* const kmin_ptr,
-  int x, y;
+                                      size_t* const kmax_ptr) {
-  for (y = 0; y < picture->height; ++y) {
+  size_t kmin = *kmin_ptr;
-    uint32_t* const dst = picture->argb + y * picture->argb_stride;
+  size_t kmax = *kmax_ptr;
-    for (x = 0; x < picture->width; ++x) dst[x] = color;
+  int print_warning = 1;
  if (kmin == 0) {  // Disable keyframe insertion.
    kmax = ~0;  // All bits set once converted to size_t.
    kmin = kmax - 1;
    print_warning = 0;
  }
  if (kmax == 0) {
    kmax = ~0;
    print_warning = 0;
  }
  // NOTE(review): the warnings below print kmin through an (int) cast although
  // it is a size_t.
  if (kmin >= kmax) {
    kmin = kmax - 1;
    if (print_warning) {
      fprintf(stderr,
              "WARNING: Setting kmin = %d, so that kmin < kmax.\n", (int)kmin);
    }
  } else if (kmin < (kmax / 2 + 1)) {
    // This ensures that cache.keyframe + kmin >= kmax is always true. So, we
    // can flush all the frames in the 'count_since_key_frame == kmax' case.
    kmin = (kmax / 2 + 1);
    if (print_warning) {
      fprintf(stderr,
              "WARNING: Setting kmin = %d, so that kmin >= kmax / 2 + 1.\n",
              (int)kmin);
    }
  }
  // Limit the max number of frames that are allocated.
  if (kmax - kmin > MAX_CACHE_SIZE) {
    kmin = kmax - MAX_CACHE_SIZE;
    if (print_warning) {
      // NOTE(review): the message hard-codes 30 while the test above uses
      // MAX_CACHE_SIZE; the two must be kept in sync by hand.
      fprintf(stderr,
              "WARNING: Setting kmin = %d, so that kmax - kmin <= 30.\n",
              (int)kmin);
    }
  }
  // Write the sanitized values back to the caller.
  *kmin_ptr = kmin;
  *kmax_ptr = kmax;
 }
 static void Remap(const uint8_t* const src, const GifFileType* const gif,
@ -62,85 +104,88 @@ static void Remap(const uint8_t* const src, const GifFileType* const gif,
  for (i = 0; i < len; ++i) {
    const GifColorType c = colors[src[i]];
-    dst[i] = (src[i] == transparent_index) ? TRANSPARENT_COLOR
+    dst[i] = (src[i] == transparent_index) ? WEBP_UTIL_TRANSPARENT_COLOR
           : c.Blue | (c.Green << 8) | (c.Red << 16) | (0xff << 24);
  }
 }
-static int ReadSubImage(GifFileType* gif, WebPPicture* pic, WebPPicture* view) {
+// Read the GIF image frame.
-  const GifImageDesc image_desc = gif->Image;
+static int ReadFrame(GifFileType* const gif, WebPFrameRect* const gif_rect,
-  const int offset_x = image_desc.Left;
+                     WebPPicture* const webp_frame) {
  // Copies the current image descriptor rectangle into '*gif_rect', then reads
  // the frame's rows from 'gif' into the matching sub-rectangle of
  // 'webp_frame'. Returns 1 on success and 0 on failure.
-  const int offset_y = image_desc.Top;
+  WebPPicture sub_image;
-  const int sub_w = image_desc.Width;
+  const GifImageDesc* const image_desc = &gif->Image;
  // NOTE(review): the next line is shown as unchanged, but it accesses
  // 'image_desc' as a struct while the '+' line above declares it as a pointer;
  // it looks like a leftover of the removed ReadSubImage().
  const int sub_h = image_desc.Height;
  uint32_t* dst = NULL;
  uint8_t* tmp = NULL;
  int ok = 0;
  WebPFrameRect rect = {
      image_desc->Left, image_desc->Top, image_desc->Width, image_desc->Height
  };
  *gif_rect = rect;
  // Use a view for the sub-picture:
-  if (!WebPPictureView(pic, offset_x, offset_y, sub_w, sub_h, view)) {
+  if (!WebPPictureView(webp_frame, rect.x_offset, rect.y_offset,
                       rect.width, rect.height, &sub_image)) {
    fprintf(stderr, "Sub-image %dx%d at position %d,%d is invalid!\n",
-            sub_w, sub_h, offset_x, offset_y);
+            rect.width, rect.height, rect.x_offset, rect.y_offset);
-    goto End;
+    return 0;
  }
-  dst = view->argb;
+  dst = sub_image.argb;
  // 'tmp' holds one row of color indices, converted to pixels by Remap().
-  tmp = (uint8_t*)malloc(sub_w * sizeof(*tmp));
+  tmp = (uint8_t*)malloc(rect.width * sizeof(*tmp));
  if (tmp == NULL) goto End;
-  if (image_desc.Interlace) {  // Interlaced image.
+  if (image_desc->Interlace) {  // Interlaced image.
    // We need 4 passes, with the following offsets and jumps.
    const int interlace_offsets[] = { 0, 4, 2, 1 };
    const int interlace_jumps[]   = { 8, 8, 4, 2 };
    int pass;
    for (pass = 0; pass < 4; ++pass) {
      int y;
-      for (y = interlace_offsets[pass]; y < sub_h; y += interlace_jumps[pass]) {
+      for (y = interlace_offsets[pass]; y < rect.height;
-        if (DGifGetLine(gif, tmp, sub_w) == GIF_ERROR) goto End;
+           y += interlace_jumps[pass]) {
-        Remap(tmp, gif, dst + y * view->argb_stride, sub_w);
+        if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
        Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
      }
    }
  } else {  // Non-interlaced image.
    int y;
-    for (y = 0; y < sub_h; ++y) {
+    for (y = 0; y < rect.height; ++y) {
-      if (DGifGetLine(gif, tmp, sub_w) == GIF_ERROR) goto End;
+      if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
-      Remap(tmp, gif, dst + y * view->argb_stride, sub_w);
+      Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
    }
  }
  // NOTE(review): the re-alignment call below uses 'pic', 'offset_x',
  // 'offset_y', 'sub_w' and 'view', which are only declared on '-' lines; it
  // looks like a leftover of the removed ReadSubImage().
  // re-align the view with even offset (and adjust dimensions if needed).
  WebPPictureView(pic, offset_x & ~1, offset_y & ~1,
                  sub_w + (offset_x & 1), sub_h + (offset_y & 1), view);
  ok = 1;
 End:
  // On failure, hand the view's error code over to the caller's picture.
  if (!ok) webp_frame->error_code = sub_image.error_code;
  WebPPictureFree(&sub_image);
  free(tmp);
  return ok;
 }
-static int GetBackgroundColor(const ColorMapObject* const color_map,
+static void GetBackgroundColor(const ColorMapObject* const color_map,
-                              GifWord bgcolor_idx, uint32_t* const bgcolor) {
+                               int bgcolor_idx, uint32_t* const bgcolor) {
  // Stores the GIF background color in '*bgcolor' as 0xAARRGGBB:
  //  - the transparent color if 'bgcolor_idx' is the current transparent index,
  //  - opaque white (with a warning on stderr) if the color map is missing or
  //    'bgcolor_idx' is not below its ColorCount,
  //  - otherwise the opaque color-map entry at 'bgcolor_idx'.
  // NOTE(review): the '+' signature declares the function void, yet the
  // 'return 1;' statements below are still shown; they look like leftovers of
  // the previous int-returning version.
  if (transparent_index != -1 && bgcolor_idx == transparent_index) {
-    *bgcolor = TRANSPARENT_COLOR;  // Special case.
+    *bgcolor = WEBP_UTIL_TRANSPARENT_COLOR;  // Special case.
    return 1;
  } else if (color_map == NULL || color_map->Colors == NULL
             || bgcolor_idx >= color_map->ColorCount) {
-    return 0;  // Invalid color map or index.
+    *bgcolor = WHITE_COLOR;
    fprintf(stderr,
            "GIF decode warning: invalid background color index. Assuming "
            "white background.\n");
  } else {
    const GifColorType color = color_map->Colors[bgcolor_idx];
    *bgcolor = (0xff        << 24)
             | (color.Red   << 16)
             | (color.Green <<  8)
             | (color.Blue  <<  0);
    return 1;
  }
 }
 static void DisplayGifError(const GifFileType* const gif, int gif_error) {
  // GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
-#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR) && \
+#if LOCAL_GIF_PREREQ(4,2)
-        ((GIFLIB_MAJOR == 4 && GIFLIB_MINOR >= 2) || GIFLIB_MAJOR > 4)
+#if LOCAL_GIF_PREREQ(5,0)
 #if GIFLIB_MAJOR >= 5
  // Static string actually, hence the const char* cast.
  const char* error_str = (const char*)GifErrorString(
      (gif == NULL) ? gif_error : gif->Error);
@ -158,7 +203,7 @@ static void DisplayGifError(const GifFileType* const gif, int gif_error) {
 #endif
 }
-static const char* const kErrorMessages[] = {
+static const char* const kErrorMessages[-WEBP_MUX_NOT_ENOUGH_DATA + 1] = {
  // Names of the WebPMuxError codes; ErrorString() looks an entry up with
  // kErrorMessages[-err]. The explicit array size ties the table length to
  // the value of WEBP_MUX_NOT_ENOUGH_DATA.
  "WEBP_MUX_NOT_FOUND", "WEBP_MUX_INVALID_ARGUMENT", "WEBP_MUX_BAD_DATA",
  "WEBP_MUX_MEMORY_ERROR", "WEBP_MUX_NOT_ENOUGH_DATA"
 };
@ -168,21 +213,37 @@ static const char* ErrorString(WebPMuxError err) {
  return kErrorMessages[-err];
 }
 // Bit flags naming the kinds of metadata that may be copied to the output;
 // main() combines them in 'keep_metadata'.
 enum {
  METADATA_ICC  = (1 << 0),
  METADATA_XMP  = (1 << 1),
  METADATA_ALL  = METADATA_ICC | METADATA_XMP
 };
 //------------------------------------------------------------------------------
 // Prints the command-line usage and the list of supported options to stdout.
 static void Help(void) {
  printf("Usage:\n");
  printf(" gif2webp [options] gif_file -o webp_file\n");
-  printf("options:\n");
+  printf("Options:\n");
  printf("  -h / -help  ............ this help\n");
-  printf("  -lossy ................. Encode image using lossy compression.\n");
+  printf("  -lossy ................. encode image using lossy compression\n");
  printf("  -mixed ................. for each frame in the image, pick lossy\n"
         "                           or lossless compression heuristically\n");
  printf("  -q <float> ............. quality factor (0:small..100:big)\n");
  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
  printf("  -kmin <int> ............ min distance between key frames\n");
  printf("  -kmax <int> ............ max distance between key frames\n");
  printf("  -f <int> ............... filter strength (0=off..100)\n");
  printf("  -metadata <string> ..... comma separated list of metadata to\n");
  printf("                           ");
  printf("copy from the input to the output if present\n");
  printf("                           "
         "Valid values: all, none, icc, xmp (default)\n");
  printf("  -mt .................... use multi-threading if available\n");
  printf("\n");
-  printf("  -version ............... print version number and exit.\n");
+  printf("  -version ............... print version number and exit\n");
-  printf("  -v ..................... verbose.\n");
+  printf("  -v ..................... verbose\n");
-  printf("  -quiet ................. don't print anything.\n");
+  printf("  -quiet ................. don't print anything\n");
  printf("\n");
 }
@ -196,25 +257,30 @@ int main(int argc, const char *argv[]) {
  const char *in_file = NULL, *out_file = NULL;
  FILE* out = NULL;
  GifFileType* gif = NULL;
-  WebPPicture picture;
+  WebPConfig config;
-  WebPPicture view;
+  WebPPicture frame;
-  WebPMemoryWriter memory;
+  int duration = 0;
-  WebPMuxFrameInfo frame;
+  FrameDisposeMethod orig_dispose = FRAME_DISPOSE_NONE;
  WebPMuxAnimParams anim = { WHITE_COLOR, 0 };
  WebPFrameCache* cache = NULL;
-  int is_first_frame = 1;
+  int is_first_frame = 1;     // Whether we are processing the first frame.
  int done;
  int c;
  int quiet = 0;
  WebPConfig config;
  WebPMux* mux = NULL;
  WebPData webp_data = { NULL, 0 };
  int keep_metadata = METADATA_XMP;  // ICC not output by default.
  int stored_icc = 0;  // Whether we have already stored an ICC profile.
  int stored_xmp = 0;
-  memset(&frame, 0, sizeof(frame));
+  int default_kmin = 1;  // Whether to use default kmin value.
-  frame.id = WEBP_CHUNK_ANMF;
+  int default_kmax = 1;
-  frame.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+  size_t kmin = 0;
  size_t kmax = 0;
  int allow_mixed = 0;   // If true, each frame can be lossy or lossless.
-  if (!WebPConfigInit(&config) || !WebPPictureInit(&picture)) {
+  if (!WebPConfigInit(&config) || !WebPPictureInit(&frame)) {
    fprintf(stderr, "Error! Version mismatch!\n");
    return -1;
  }
@ -226,6 +292,7 @@ int main(int argc, const char *argv[]) {
  }
  for (c = 1; c < argc; ++c) {
    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
      return 0;
@ -233,12 +300,62 @@ int main(int argc, const char *argv[]) {
      out_file = argv[++c];
    } else if (!strcmp(argv[c], "-lossy")) {
      config.lossless = 0;
    } else if (!strcmp(argv[c], "-mixed")) {
      allow_mixed = 1;
      config.lossless = 0;
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
-      config.quality = (float)strtod(argv[++c], NULL);
+      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
-      config.method = strtol(argv[++c], NULL, 0);
+      config.method = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-kmax") && c < argc - 1) {
      kmax = ExUtilGetUInt(argv[++c], 0, &parse_error);
      default_kmax = 0;
    } else if (!strcmp(argv[c], "-kmin") && c < argc - 1) {
      kmin = ExUtilGetUInt(argv[++c], 0, &parse_error);
      default_kmin = 0;
    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
-      config.filter_strength = strtol(argv[++c], NULL, 0);
+      config.filter_strength = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-metadata") && c < argc - 1) {
      static const struct {
        const char* option;
        int flag;
      } kTokens[] = {
        { "all",  METADATA_ALL },
        { "none", 0 },
        { "icc",  METADATA_ICC },
        { "xmp",  METADATA_XMP },
      };
      const size_t kNumTokens = sizeof(kTokens) / sizeof(*kTokens);
      const char* start = argv[++c];
      const char* const end = start + strlen(start);
      keep_metadata = 0;
      while (start < end) {
        size_t i;
        const char* token = strchr(start, ',');
        if (token == NULL) token = end;
        for (i = 0; i < kNumTokens; ++i) {
          if ((size_t)(token - start) == strlen(kTokens[i].option) &&
              !strncmp(start, kTokens[i].option, strlen(kTokens[i].option))) {
            if (kTokens[i].flag != 0) {
              keep_metadata |= kTokens[i].flag;
            } else {
              keep_metadata = 0;
            }
            break;
          }
        }
        if (i == kNumTokens) {
          fprintf(stderr, "Error! Unknown metadata type '%.*s'\n",
                  (int)(token - start), start);
          Help();
          return -1;
        }
        start = token + 1;
      }
    } else if (!strcmp(argv[c], "-mt")) {
      ++config.thread_level;
    } else if (!strcmp(argv[c], "-version")) {
      const int enc_version = WebPGetEncoderVersion();
      const int mux_version = WebPGetMuxVersion();
@ -251,6 +368,9 @@ int main(int argc, const char *argv[]) {
      quiet = 1;
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
    } else if (!strcmp(argv[c], "--")) {
      if (c < argc - 1) in_file = argv[++c];
      break;
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Error! Unknown option '%s'\n", argv[c]);
      Help();
@ -258,7 +378,22 @@ int main(int argc, const char *argv[]) {
    } else {
      in_file = argv[c];
    }
    if (parse_error) {
      Help();
      return -1;
    }
  }
  // Appropriate default kmin, kmax values for lossy and lossless.
  if (default_kmin) {
    kmin = config.lossless ? 9 : 3;
  }
  if (default_kmax) {
    kmax = config.lossless ? 17 : 5;
  }
  SanitizeKeyFrameIntervals(&kmin, &kmax);
  if (!WebPValidateConfig(&config)) {
    fprintf(stderr, "Error! Invalid configuration.\n");
    goto End;
@ -271,22 +406,13 @@ int main(int argc, const char *argv[]) {
  }
  // Start the decoder object
-#if defined(GIFLIB_MAJOR) && (GIFLIB_MAJOR >= 5)
+#if LOCAL_GIF_PREREQ(5,0)
  // There was an API change in version 5.0.0.
  gif = DGifOpenFileName(in_file, &gif_error);
 #else
  gif = DGifOpenFileName(in_file);
 #endif
  if (gif == NULL) goto End;
  // Allocate picture buffer
  picture.width = gif->SWidth;
  picture.height = gif->SHeight;
  picture.use_argb = 1;
  picture.writer = WebPMemoryWrite;
  picture.custom_ptr = &memory;
  if (!WebPPictureAlloc(&picture)) goto End;
  mux = WebPMuxNew();
  if (mux == NULL) {
    fprintf(stderr, "ERROR: could not create a mux object.\n");
@ -301,51 +427,85 @@ int main(int argc, const char *argv[]) {
    switch (type) {
      case IMAGE_DESC_RECORD_TYPE: {
-        if (frame.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+        WebPFrameRect gif_rect;
-          ClearPicture(&picture, anim.bgcolor);
+        GifImageDesc* const image_desc = &gif->Image;
        }
        if (!DGifGetImageDesc(gif)) goto End;
        if (!ReadSubImage(gif, &picture, &view)) goto End;
-        WebPMemoryWriterInit(&memory);
+        // Fix some broken GIF global headers that report
-        if (!config.lossless) {
+        // 0 x 0 screen dimension.
-          // We need to call BGRA variant because of the way we do Remap().
+        if (is_first_frame) {
-          // TODO(later): This works for little-endian only due to uint32_t to
+          if (verbose) {
-          // uint8_t conversion. Make it work for big-endian too.
+            printf("Canvas screen: %d x %d\n", gif->SWidth, gif->SHeight);
          WebPPictureImportBGRA(&view, (uint8_t*)view.argb,
                                view.argb_stride * sizeof(*view.argb));
          view.use_argb = 0;
        } else {
          view.use_argb = 1;
          }
-        if (!WebPEncode(&config, &view)) {
+          if (gif->SWidth == 0 || gif->SHeight == 0) {
-          fprintf(stderr, "Error! Cannot encode picture as WebP\n");
+            image_desc->Left = 0;
-          fprintf(stderr, "Error code: %d\n", view.error_code);
+            image_desc->Top = 0;
            gif->SWidth = image_desc->Width;
            gif->SHeight = image_desc->Height;
            if (gif->SWidth <= 0 || gif->SHeight <= 0) {
              goto End;
            }
            if (verbose) {
              printf("Fixed canvas screen dimension to: %d x %d\n",
                     gif->SWidth, gif->SHeight);
            }
          }
 #if WEBP_MUX_ABI_VERSION > 0x0101
          // Set definitive canvas size.
          err = WebPMuxSetCanvasSize(mux, gif->SWidth, gif->SHeight);
          if (err != WEBP_MUX_OK) {
            fprintf(stderr, "Invalid canvas size %d x %d\n",
                    gif->SWidth, gif->SHeight);
            goto End;
          }
 #endif
          // Allocate current buffer.
          frame.width = gif->SWidth;
          frame.height = gif->SHeight;
          frame.use_argb = 1;
          if (!WebPPictureAlloc(&frame)) goto End;
          WebPUtilClearPic(&frame, NULL);
          // Initialize cache.
          cache = WebPFrameCacheNew(frame.width, frame.height,
                                    kmin, kmax, allow_mixed);
          if (cache == NULL) goto End;
          // Background color.
          GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
                             &anim.bgcolor);
        }
        // Some even more broken GIF can have sub-rect with zero width/height.
        if (image_desc->Width == 0 || image_desc->Height == 0) {
          image_desc->Width = gif->SWidth;
          image_desc->Height = gif->SHeight;
        }
        if (!ReadFrame(gif, &gif_rect, &frame)) {
          goto End;
        }
-        // Now we have all the info about the frame, as a Graphic Control
+        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, orig_dispose,
-        // Extension Block always appears before the Image Descriptor Block.
+                                    duration, &frame)) {
-        // So add the frame to mux.
+          fprintf(stderr, "Error! Cannot encode frame as WebP\n");
-        frame.x_offset = gif->Image.Left & ~1;
+          fprintf(stderr, "Error code: %d\n", frame.error_code);
-        frame.y_offset = gif->Image.Top & ~1;
+        }
-        frame.bitstream.bytes = memory.mem;
+
-        frame.bitstream.size = memory.size;
+        err = WebPFrameCacheFlush(cache, verbose, mux);
        err = WebPMuxPushFrame(mux, &frame, 1);
        if (err != WEBP_MUX_OK) {
          fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
                  ErrorString(err));
          goto End;
        }
-        if (verbose) {
+        is_first_frame = 0;
-          printf("Added frame %dx%d (offset:%d,%d duration:%d) ",
+
-                 view.width, view.height, frame.x_offset, frame.y_offset,
+        // In GIF, graphic control extensions are optional for a frame, so we
-                 frame.duration);
+        // may not get one before reading the next frame. To handle this case,
-          printf("dispose:%d transparent index:%d\n",
+        // we reset frame properties to reasonable defaults for the next frame.
-                 frame.dispose_method, transparent_index);
+        orig_dispose = FRAME_DISPOSE_NONE;
-        }
+        duration = 0;
-        WebPDataClear(&frame.bitstream);
+        transparent_index = -1;  // Opaque frame by default.
        break;
      }
      case EXTENSION_RECORD_TYPE: {
@ -363,27 +523,21 @@ int main(int argc, const char *argv[]) {
            const int dispose = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
            const int delay = data[2] | (data[3] << 8);  // In 10 ms units.
            if (data[0] != 4) goto End;
-            frame.duration = delay * 10;  // Duration is in 1 ms units for WebP.
+            duration = delay * 10;  // Duration is in 1 ms units for WebP.
-            if (dispose == 3) {
+            switch (dispose) {
-              fprintf(stderr, "WARNING: GIF_DISPOSE_RESTORE not supported.");
+              case 3:
-              // failsafe. TODO(urvang): emulate the correct behaviour by
+                orig_dispose = FRAME_DISPOSE_RESTORE_PREVIOUS;
-              // recoding the whole frame.
+                break;
-              frame.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+              case 2:
-            } else {
+                orig_dispose = FRAME_DISPOSE_BACKGROUND;
-              frame.dispose_method =
+                break;
-                  (dispose == 2) ? WEBP_MUX_DISPOSE_BACKGROUND
+              case 1:
-                                 : WEBP_MUX_DISPOSE_NONE;
+              case 0:
              default:
                orig_dispose = FRAME_DISPOSE_NONE;
                break;
            }
            transparent_index = (flags & GIF_TRANSPARENT_MASK) ? data[4] : -1;
            if (is_first_frame) {
              if (!GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
                                      &anim.bgcolor)) {
                fprintf(stderr, "GIF decode warning: invalid background color "
                        "index. Assuming white background.\n");
              }
              ClearPicture(&picture, anim.bgcolor);
              is_first_frame = 0;
            }
            break;
          }
          case PLAINTEXT_EXT_FUNC_CODE: {
@ -391,31 +545,84 @@ int main(int argc, const char *argv[]) {
          }
          case APPLICATION_EXT_FUNC_CODE: {
            if (data[0] != 11) break;    // Chunk is too short
-            if (!memcmp(data + 1, "NETSCAPE2.0", 11)) {
+            if (!memcmp(data + 1, "NETSCAPE2.0", 11) ||
                !memcmp(data + 1, "ANIMEXTS1.0", 11)) {
              // Recognize and parse Netscape2.0 NAB extension for loop count.
              if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
              if (data == NULL) goto End;  // Loop count sub-block missing.
-              if (data[0] != 3 && data[1] != 1) break;   // wrong size/marker
+              if (data[0] < 3 || data[1] != 1) break;   // wrong size/marker
              anim.loop_count = data[2] | (data[3] << 8);
-              if (verbose) printf("Loop count: %d\n", anim.loop_count);
+              if (verbose) {
-            } else if (!memcmp(data + 1, "XMP dataXMP", 11)) {
+                fprintf(stderr, "Loop count: %d\n", anim.loop_count);
-              // Read XMP metadata.
+              }
-              WebPData xmp;
+            } else {  // An extension containing metadata.
-              if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
+              // We only store the first encountered chunk of each type, and
-              if (data == NULL) goto End;
+              // only if requested by the user.
-              xmp.bytes = (uint8_t*)data;
+              const int is_xmp = (keep_metadata & METADATA_XMP) &&
-              xmp.size = data[0] + 1;
+                                 !stored_xmp &&
-              WebPMuxSetChunk(mux, "XMP ", &xmp, 1);
+                                 !memcmp(data + 1, "XMP DataXMP", 11);
-              if (verbose) printf("XMP size: %d\n", (int)xmp.size);
+              const int is_icc = (keep_metadata & METADATA_ICC) &&
-            } else if (!memcmp(data + 1, "ICCRGBG1012", 11)) {
+                                 !stored_icc &&
-              // Read ICC profile.
+                                 !memcmp(data + 1, "ICCRGBG1012", 11);
-              WebPData icc;
+              if (is_xmp || is_icc) {
-              if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
+                const char* const fourccs[2] = { "XMP " , "ICCP" };
-              if (data == NULL) goto End;
+                const char* const features[2] = { "XMP" , "ICC" };
-              icc.bytes = (uint8_t*)data;
+                WebPData metadata = { NULL, 0 };
-              icc.size = data[0] + 1;
+                // Construct metadata from sub-blocks.
-              WebPMuxSetChunk(mux, "ICCP", &icc, 1);
+                // Usual case (including ICC profile): In each sub-block, the
-              if (verbose) printf("ICC size: %d\n", (int)icc.size);
+                // first byte specifies its size in bytes (0 to 255) and the
                // rest of the bytes contain the data.
                // Special case for XMP data: In each sub-block, the first byte
                // is also part of the XMP payload. XMP in GIF also has a 257
                // byte padding data. See the XMP specification for details.
                while (1) {
                  WebPData prev_metadata = metadata;
                  WebPData subblock;
                  if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) {
                    WebPDataClear(&metadata);
                    goto End;
                  }
                  if (data == NULL) break;  // Finished.
                  subblock.size = is_xmp ? data[0] + 1 : data[0];
                  assert(subblock.size > 0);
                  subblock.bytes = is_xmp ? data : data + 1;
                  metadata.bytes =
                      (uint8_t*)realloc((void*)metadata.bytes,
                                        prev_metadata.size + subblock.size);
                  if (metadata.bytes == NULL) {
                    WebPDataClear(&prev_metadata);
                    goto End;
                  }
                  metadata.size += subblock.size;
                  memcpy((void*)(metadata.bytes + prev_metadata.size),
                         subblock.bytes, subblock.size);
                }
                if (is_xmp) {
                  // XMP padding data is 0x01, 0xff, 0xfe ... 0x01, 0x00.
                  const size_t xmp_pading_size = 257;
                  if (metadata.size > xmp_pading_size) {
                    metadata.size -= xmp_pading_size;
                  }
                }
                // Add metadata chunk.
                err = WebPMuxSetChunk(mux, fourccs[is_icc], &metadata, 1);
                if (verbose) {
                  fprintf(stderr, "%s size: %d\n",
                          features[is_icc], (int)metadata.size);
                }
                WebPDataClear(&metadata);
                if (err != WEBP_MUX_OK) {
                  fprintf(stderr, "ERROR (%s): Could not set %s chunk.\n",
                          ErrorString(err), features[is_icc]);
                  goto End;
                }
                if (is_icc) {
                  stored_icc = 1;
                } else if (is_xmp) {
                  stored_xmp = 1;
                }
              }
            }
            break;
          }
@ -423,9 +630,9 @@ int main(int argc, const char *argv[]) {
            break;  // skip
          }
        }
-        do {
+        while (data != NULL) {
          if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
-        } while (data != NULL);
+        }
        break;
      }
      case TERMINATE_RECORD_TYPE: {
@ -441,6 +648,14 @@ int main(int argc, const char *argv[]) {
    }
  } while (!done);
  // Flush any pending frames.
  err = WebPFrameCacheFlushAll(cache, verbose, mux);
  if (err != WEBP_MUX_OK) {
    fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
            ErrorString(err));
    goto End;
  }
  // Finish muxing
  err = WebPMuxSetAnimationParams(mux, &anim);
  if (err != WEBP_MUX_OK) {
@ -460,11 +675,11 @@ int main(int argc, const char *argv[]) {
      goto End;
    }
    if (!quiet) {
-      printf("Saved output file: %s\n", out_file);
+      fprintf(stderr, "Saved output file: %s\n", out_file);
    }
  } else {
    if (!quiet) {
-      printf("Nothing written; use -o flag to save the result.\n");
+      fprintf(stderr, "Nothing written; use -o flag to save the result.\n");
    }
  }
@ -475,17 +690,32 @@ int main(int argc, const char *argv[]) {
 End:
  WebPDataClear(&webp_data);
  WebPMuxDelete(mux);
-  WebPPictureFree(&picture);
+  WebPPictureFree(&frame);
  WebPFrameCacheDelete(cache);
  if (out != NULL && out_file != NULL) fclose(out);
  if (gif_error != GIF_OK) {
    DisplayGifError(gif, gif_error);
  }
  if (gif != NULL) {
 #if LOCAL_GIF_PREREQ(5,1)
    DGifCloseFile(gif, &gif_error);
 #else
    DGifCloseFile(gif);
 #endif
  }
  return !ok;
 }
 #else  // !WEBP_HAVE_GIF
 // Fallback entry point for builds where WEBP_HAVE_GIF is not defined: reports
 // on stderr that GIF support is not enabled and returns 0.
 int main(int argc, const char *argv[]) {
  fprintf(stderr, "GIF support not enabled in %s.\n", argv[0]);
  (void)argc;  // Unused in this variant.
  return 0;
 }
 #endif
 //------------------------------------------------------------------------------
--- a/examples/gif2webp_util.c
+++ b/examples/gif2webp_util.c
--- a/examples/gif2webp_util.h
+++ b/examples/gif2webp_util.h
@ -0,0 +1,88 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Helper structs and methods for gif2webp tool.
 //
 // Author: Urvang (urvang@google.com)
 #ifndef WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
 #define WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
 #include <stdlib.h>
 #include "webp/mux.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // Helper utilities.
 #define WEBP_UTIL_TRANSPARENT_COLOR 0x00ffffff
 struct WebPPicture;
 // Includes all disposal methods, even the ones not supported by WebP bitstream.
 typedef enum FrameDisposeMethod {
  FRAME_DISPOSE_NONE,
  FRAME_DISPOSE_BACKGROUND,
  FRAME_DISPOSE_RESTORE_PREVIOUS
 } FrameDisposeMethod;
 typedef struct {
  int x_offset, y_offset, width, height;
 } WebPFrameRect;
 // Clear pixels in 'picture' within given 'rect' to transparent color.
 void WebPUtilClearPic(struct WebPPicture* const picture,
                      const WebPFrameRect* const rect);
 //------------------------------------------------------------------------------
 // Frame cache.
 typedef struct WebPFrameCache WebPFrameCache;
 // Given the minimum distance between key frames 'kmin' and maximum distance
 // between key frames 'kmax', returns an appropriately allocated cache object.
 // If 'allow_mixed' is true, the subsequent calls to WebPFrameCacheAddFrame()
 // will heuristically pick lossy or lossless compression for each frame.
 // Use WebPFrameCacheDelete() to deallocate the 'cache'.
 WebPFrameCache* WebPFrameCacheNew(int width, int height,
                                  size_t kmin, size_t kmax, int allow_mixed);
 // Release all the frame data from 'cache' and free 'cache'.
 void WebPFrameCacheDelete(WebPFrameCache* const cache);
 // Given an image described by 'frame', 'rect', 'dispose_method' and 'duration',
 // optimize it for WebP, encode it and add it to 'cache'. 'rect' can be NULL.
 // This takes care of frame disposal too, according to 'dispose_method'.
 // Returns false in case of error (and sets frame->error_code accordingly).
 int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
                           const WebPConfig* const config,
                           const WebPFrameRect* const rect,
                           FrameDisposeMethod dispose_method, int duration,
                           WebPPicture* const frame);
 // Flush the *ready* frames from cache and add them to 'mux'. If 'verbose' is
 // true, prints the information about these frames.
 WebPMuxError WebPFrameCacheFlush(WebPFrameCache* const cache, int verbose,
                                 WebPMux* const mux);
 // Similar to 'WebPFrameCacheFlush()', but flushes *all* the frames.
 WebPMuxError WebPFrameCacheFlushAll(WebPFrameCache* const cache, int verbose,
                                    WebPMux* const mux);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  // WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
--- a/examples/jpegdec.c
+++ b/examples/jpegdec.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // JPEG decode.
@ -10,7 +12,7 @@
 #include "./jpegdec.h"
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "webp/config.h"
 #endif
 #include <stdio.h>
@ -209,31 +211,31 @@ static void my_error_exit(j_common_ptr dinfo) {
 int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
  int ok = 0;
  int stride, width, height;
-  struct jpeg_decompress_struct dinfo;
+  volatile struct jpeg_decompress_struct dinfo;
  struct my_error_mgr jerr;
-  uint8_t* rgb = NULL;
+  uint8_t* volatile rgb = NULL;
  JSAMPROW buffer[1];
  memset((j_decompress_ptr)&dinfo, 0, sizeof(dinfo));   // for setjmp sanity
  dinfo.err = jpeg_std_error(&jerr.pub);
  jerr.pub.error_exit = my_error_exit;
  if (setjmp(jerr.setjmp_buffer)) {
 Error:
    MetadataFree(metadata);
-    jpeg_destroy_decompress(&dinfo);
+    jpeg_destroy_decompress((j_decompress_ptr)&dinfo);
    goto End;
  }
-  jpeg_create_decompress(&dinfo);
+  jpeg_create_decompress((j_decompress_ptr)&dinfo);
-  jpeg_stdio_src(&dinfo, in_file);
+  jpeg_stdio_src((j_decompress_ptr)&dinfo, in_file);
-  if (metadata != NULL) SaveMetadataMarkers(&dinfo);
+  if (metadata != NULL) SaveMetadataMarkers((j_decompress_ptr)&dinfo);
-  jpeg_read_header(&dinfo, TRUE);
+  jpeg_read_header((j_decompress_ptr)&dinfo, TRUE);
  dinfo.out_color_space = JCS_RGB;
  dinfo.dct_method = JDCT_IFAST;
  dinfo.do_fancy_upsampling = TRUE;
-  jpeg_start_decompress(&dinfo);
+  jpeg_start_decompress((j_decompress_ptr)&dinfo);
  if (dinfo.output_components != 3) {
    goto Error;
@ -250,26 +252,27 @@ int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
  buffer[0] = (JSAMPLE*)rgb;
  while (dinfo.output_scanline < dinfo.output_height) {
-    if (jpeg_read_scanlines(&dinfo, buffer, 1) != 1) {
+    if (jpeg_read_scanlines((j_decompress_ptr)&dinfo, buffer, 1) != 1) {
      goto End;
    }
    buffer[0] += stride;
  }
  if (metadata != NULL) {
-    ok = ExtractMetadataFromJPEG(&dinfo, metadata);
+    ok = ExtractMetadataFromJPEG((j_decompress_ptr)&dinfo, metadata);
    if (!ok) {
      fprintf(stderr, "Error extracting JPEG metadata!\n");
      goto Error;
    }
  }
-  jpeg_finish_decompress(&dinfo);
+  jpeg_finish_decompress((j_decompress_ptr)&dinfo);
-  jpeg_destroy_decompress(&dinfo);
+  jpeg_destroy_decompress((j_decompress_ptr)&dinfo);
  // WebP conversion.
  pic->width = width;
  pic->height = height;
  pic->use_argb = 1;      // store raw RGB samples
  ok = WebPPictureImportRGB(pic, rgb, stride);
  if (!ok) goto Error;
--- a/examples/jpegdec.h
+++ b/examples/jpegdec.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // JPEG decode.
@ -13,7 +15,7 @@
 #include <stdio.h>
 #include "webp/types.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
@ -26,7 +28,7 @@ struct WebPPicture;
 int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
             struct Metadata* const metadata);
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/examples/metadata.c
+++ b/examples/metadata.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Metadata types and functions.
--- a/examples/metadata.h
+++ b/examples/metadata.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Metadata types and functions.
@ -13,7 +15,7 @@
 #include "webp/types.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
@ -38,7 +40,7 @@ void MetadataFree(Metadata* const metadata);
 int MetadataCopy(const char* metadata, size_t metadata_len,
                 MetadataPayload* const payload);
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/examples/pngdec.c
+++ b/examples/pngdec.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // PNG decode.
@ -10,7 +12,7 @@
 #include "./pngdec.h"
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "webp/config.h"
 #endif
 #include <stdio.h>
@ -19,12 +21,13 @@
 #include <png.h>
 #include <setjmp.h>   // note: this must be included *after* png.h
 #include <stdlib.h>
 #include <string.h>
 #include "webp/encode.h"
 #include "./metadata.h"
-static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
+static void PNGAPI error_function(png_structp png, png_const_charp error) {
-  (void)dummy;  // remove variable-unused warning
+  if (error != NULL) fprintf(stderr, "libpng error: %s\n", error);
  longjmp(png_jmpbuf(png), 1);
 }
@ -108,6 +111,8 @@ static const struct {
  // See also: ExifTool on CPAN.
  { "Raw profile type exif", ProcessRawProfile, METADATA_OFFSET(exif) },
  { "Raw profile type xmp",  ProcessRawProfile, METADATA_OFFSET(xmp) },
  // Exiftool puts exif data in APP1 chunk, too.
  { "Raw profile type APP1", ProcessRawProfile, METADATA_OFFSET(exif) },
  // XMP Specification Part 3, Section 3 #PNG
  { "XML:com.adobe.xmp",     MetadataCopy,      METADATA_OFFSET(xmp) },
  { NULL, NULL, 0 },
@ -185,9 +190,9 @@ static int ExtractMetadataFromPNG(png_structp png,
 int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
            Metadata* const metadata) {
-  png_structp png;
+  volatile png_structp png;
-  png_infop info = NULL;
+  volatile png_infop info = NULL;
-  png_infop end_info = NULL;
+  volatile png_infop end_info = NULL;
  int color_type, bit_depth, interlaced;
  int has_alpha;
  int num_passes;
@ -195,7 +200,7 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
  int ok = 0;
  png_uint_32 width, height, y;
  int stride;
-  uint8_t* rgb = NULL;
+  uint8_t* volatile rgb = NULL;
  png = png_create_read_struct(PNG_LIBPNG_VER_STRING, 0, 0, 0);
  if (png == NULL) {
@ -206,8 +211,6 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
  if (setjmp(png_jmpbuf(png))) {
 Error:
    MetadataFree(metadata);
    png_destroy_read_struct(&png, &info, &end_info);
    free(rgb);
    goto End;
  }
@ -224,7 +227,9 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
  png_set_strip_16(png);
  png_set_packing(png);
-  if (color_type == PNG_COLOR_TYPE_PALETTE) png_set_palette_to_rgb(png);
+  if (color_type == PNG_COLOR_TYPE_PALETTE) {
    png_set_palette_to_rgb(png);
  }
  if (color_type == PNG_COLOR_TYPE_GRAY ||
      color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
    if (bit_depth < 8) {
@ -251,7 +256,7 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
  if (rgb == NULL) goto Error;
  for (p = 0; p < num_passes; ++p) {
    for (y = 0; y < height; ++y) {
-      png_bytep row = rgb + y * stride;
+      png_bytep row = (png_bytep)(rgb + y * stride);
      png_read_rows(png, &row, NULL, 1);
    }
  }
@ -263,19 +268,22 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
    goto Error;
  }
  png_destroy_read_struct(&png, &info, &end_info);
  pic->width = width;
  pic->height = height;
  pic->use_argb = 1;
  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
                 : WebPPictureImportRGB(pic, rgb, stride);
  free(rgb);
  if (!ok) {
    goto Error;
  }
 End:
  if (png != NULL) {
    png_destroy_read_struct((png_structpp)&png,
                            (png_infopp)&info, (png_infopp)&end_info);
  }
  free(rgb);
  return ok;
 }
 #else  // !WEBP_HAVE_PNG
--- a/examples/pngdec.h
+++ b/examples/pngdec.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // PNG decode.
@ -12,7 +14,7 @@
 #include <stdio.h>
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
@ -26,7 +28,7 @@ struct WebPPicture;
 int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
            struct Metadata* const metadata);
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/examples/stopwatch.h
+++ b/examples/stopwatch.h
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Helper functions to measure elapsed time.
@ -12,11 +14,17 @@
 #ifndef WEBP_EXAMPLES_STOPWATCH_H_
 #define WEBP_EXAMPLES_STOPWATCH_H_
 #include "webp/types.h"
 #if defined _WIN32 && !defined __GNUC__
 #include <windows.h>
 typedef LARGE_INTEGER Stopwatch;
 // Starts (or restarts) the stopwatch by storing the current performance
 // counter value in 'watch'.
 static WEBP_INLINE void StopwatchReset(Stopwatch* watch) {
  QueryPerformanceCounter(watch);
 }
 static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
  const LARGE_INTEGER old_value = *watch;
  LARGE_INTEGER freq;
@ -31,15 +39,23 @@ static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
 #else    /* !_WIN32 */
 #include <string.h>  // memcpy
 #include <sys/time.h>
 typedef struct timeval Stopwatch;
-static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
+static WEBP_INLINE void StopwatchReset(Stopwatch* watch) {
  const struct timeval old_value = *watch;
  gettimeofday(watch, NULL);
-  return watch->tv_sec - old_value.tv_sec +
+}
-      (watch->tv_usec - old_value.tv_usec) / 1000000.0;
+
 // Returns the time elapsed, in seconds, between the timestamp previously
 // stored in 'watch' and now, and overwrites 'watch' with the current time.
 static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
  struct timeval old_value;
  double delta_sec, delta_usec;
  // Save the previous timestamp before it is overwritten below.
  memcpy(&old_value, watch, sizeof(old_value));
  gettimeofday(watch, NULL);
  // Compute both differences in double precision.
  delta_sec = (double)watch->tv_sec - old_value.tv_sec;
  delta_usec = (double)watch->tv_usec - old_value.tv_usec;
  return delta_sec + delta_usec / 1000000.0;
 }
 #endif   /* _WIN32 */
--- a/examples/tiffdec.c
+++ b/examples/tiffdec.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // TIFF decode.
@ -10,7 +12,7 @@
 #include "./tiffdec.h"
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "webp/config.h"
 #endif
 #include <stdio.h>
@ -95,9 +97,10 @@ int ReadTIFF(const char* const filename,
      pic->width = width;
      pic->height = height;
      // TIFF data is ABGR
-#ifdef __BIG_ENDIAN__
+#ifdef WORDS_BIGENDIAN
      TIFFSwabArrayOfLong(raster, width * height);
 #endif
      pic->use_argb = 1;
      ok = keep_alpha
         ? WebPPictureImportRGBA(pic, (const uint8_t*)raster, stride)
         : WebPPictureImportRGBX(pic, (const uint8_t*)raster, stride);
--- a/examples/tiffdec.h
+++ b/examples/tiffdec.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // TIFF decode.
@ -10,7 +12,7 @@
 #ifndef WEBP_EXAMPLES_TIFFDEC_H_
 #define WEBP_EXAMPLES_TIFFDEC_H_
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
@ -25,7 +27,7 @@ int ReadTIFF(const char* const filename,
             struct WebPPicture* const pic, int keep_alpha,
             struct Metadata* const metadata);
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -1,25 +1,26 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-//  Simple WebP file viewer.
+//  Simple OpenGL-based WebP file viewer.
 //
 // Compiling on linux:
 //   sudo apt-get install freeglut3-dev mesa-common-dev
 //   gcc -o vwebp vwebp.c -O3 -lwebp -lwebpmux -lglut -lGL -lpthread -lm
 // Compiling on Mac + XCode:
 //   gcc -o vwebp vwebp.c -lwebp -lwebpmux -framework GLUT -framework OpenGL
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifdef HAVE_CONFIG_H
 #include "webp/config.h"
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#ifdef __APPLE__
+#if defined(WEBP_HAVE_GL)
 #if defined(HAVE_GLUT_GLUT_H)
 #include <GLUT/glut.h>
 #else
 #include <GL/glut.h>
@ -41,8 +42,6 @@
 #define snprintf _snprintf
 #endif
 static void Help(void);
 // Unfortunate global variables. Gathered into a struct for comfort.
 static struct {
  int has_animation;
@ -58,15 +57,11 @@ static struct {
  const char* file_name;
  WebPData data;
-  WebPDecoderConfig* config;
+  WebPDecoderConfig config;
  const WebPDecBuffer* pic;
  WebPDemuxer* dmux;
-  WebPIterator frameiter;
+  WebPIterator curr_frame;
-  struct {
+  WebPIterator prev_frame;
    int width, height;
    int x_offset, y_offset;
    enum WebPMuxAnimDispose dispose_method;
  } prev_frame;
  WebPChunkIterator iccp;
 } kParams;
@ -78,12 +73,23 @@ static void ClearPreviousPic(void) {
 static void ClearParams(void) {
  ClearPreviousPic();
  WebPDataClear(&kParams.data);
-  WebPDemuxReleaseIterator(&kParams.frameiter);
+  WebPDemuxReleaseIterator(&kParams.curr_frame);
  WebPDemuxReleaseIterator(&kParams.prev_frame);
  WebPDemuxReleaseChunkIterator(&kParams.iccp);
  WebPDemuxDelete(kParams.dmux);
  kParams.dmux = NULL;
 }
 // Sets the previous frame to the dimensions of the canvas and has it dispose
 // to background to cause the canvas to be cleared.
 static void ClearPreviousFrame(void) {
  WebPIterator* const prev = &kParams.prev_frame;
  // Cover the full canvas: same size as the canvas, anchored at its origin.
  prev->width = kParams.canvas_width;
  prev->height = kParams.canvas_height;
  prev->x_offset = prev->y_offset = 0;
  // HandleDisplay() tests this dispose method on the previous frame.
  prev->dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
 }
 // -----------------------------------------------------------------------------
 // Color profile handling
 static int ApplyColorProfile(const WebPData* const profile,
@ -148,25 +154,25 @@ static int ApplyColorProfile(const WebPData* const profile,
 //------------------------------------------------------------------------------
 // File decoding
-static int Decode(void) {   // Fills kParams.frameiter
+static int Decode(void) {   // Fills kParams.curr_frame
-  const WebPIterator* const iter = &kParams.frameiter;
+  const WebPIterator* const curr = &kParams.curr_frame;
-  WebPDecoderConfig* const config = kParams.config;
+  WebPDecoderConfig* const config = &kParams.config;
  WebPDecBuffer* const output_buffer = &config->output;
  int ok = 0;
  ClearPreviousPic();
  output_buffer->colorspace = MODE_RGBA;
-  ok = (WebPDecode(iter->fragment.bytes, iter->fragment.size,
+  ok = (WebPDecode(curr->fragment.bytes, curr->fragment.size,
                   config) == VP8_STATUS_OK);
  if (!ok) {
-    fprintf(stderr, "Decoding of frame #%d failed!\n", iter->frame_num);
+    fprintf(stderr, "Decoding of frame #%d failed!\n", curr->frame_num);
  } else {
    kParams.pic = output_buffer;
    if (kParams.use_color_profile) {
      ok = ApplyColorProfile(&kParams.iccp.chunk, output_buffer);
      if (!ok) {
        fprintf(stderr, "Applying color profile to frame #%d failed!\n",
-                iter->frame_num);
+                curr->frame_num);
      }
    }
  }
@ -177,19 +183,21 @@ static void decode_callback(int what) {
  if (what == 0 && !kParams.done) {
    int duration = 0;
    if (kParams.dmux != NULL) {
-      WebPIterator* const iter = &kParams.frameiter;
+      WebPIterator* const curr = &kParams.curr_frame;
-      if (!WebPDemuxNextFrame(iter)) {
+      if (!WebPDemuxNextFrame(curr)) {
-        WebPDemuxReleaseIterator(iter);
+        WebPDemuxReleaseIterator(curr);
-        if (WebPDemuxGetFrame(kParams.dmux, 1, iter)) {
+        if (WebPDemuxGetFrame(kParams.dmux, 1, curr)) {
          --kParams.loop_count;
          kParams.done = (kParams.loop_count == 0);
          if (kParams.done) return;
          ClearPreviousFrame();
        } else {
          kParams.decoding_error = 1;
          kParams.done = 1;
          return;
        }
      }
-      duration = iter->duration;
+      duration = curr->duration;
    }
    if (!Decode()) {
      kParams.decoding_error = 1;
@ -282,40 +290,45 @@ static void DrawCheckerBoard(void) {
 static void HandleDisplay(void) {
  const WebPDecBuffer* const pic = kParams.pic;
-  const WebPIterator* const iter = &kParams.frameiter;
+  const WebPIterator* const curr = &kParams.curr_frame;
  WebPIterator* const prev = &kParams.prev_frame;
  GLfloat xoff, yoff;
  if (pic == NULL) return;
  glPushMatrix();
  glPixelZoom(1, -1);
-  xoff = (GLfloat)(2. * iter->x_offset / kParams.canvas_width);
+  xoff = (GLfloat)(2. * curr->x_offset / kParams.canvas_width);
-  yoff = (GLfloat)(2. * iter->y_offset / kParams.canvas_height);
+  yoff = (GLfloat)(2. * curr->y_offset / kParams.canvas_height);
  glRasterPos2f(-1.f + xoff, 1.f - yoff);
  glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
  glPixelStorei(GL_UNPACK_ROW_LENGTH, pic->u.RGBA.stride / 4);
-  if (kParams.prev_frame.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+  if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ||
      curr->blend_method == WEBP_MUX_NO_BLEND) {
    // TODO(later): these offsets and those above should factor in window size.
    //              they will be incorrect if the window is resized.
    // glScissor() takes window coordinates (0,0 at bottom left).
-    const int window_x = kParams.prev_frame.x_offset;
+    int window_x, window_y;
-    const int window_y = kParams.canvas_height -
+    if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
-                         kParams.prev_frame.y_offset -
+      // Clear the previous frame rectangle.
-                         kParams.prev_frame.height;
+      window_x = prev->x_offset;
      window_y = kParams.canvas_height - prev->y_offset - prev->height;
    } else {  // curr->blend_method == WEBP_MUX_NO_BLEND.
      // We simulate no-blending behavior by first clearing the current frame
      // rectangle (to a checker-board) and then alpha-blending against it.
      window_x = curr->x_offset;
      window_y = kParams.canvas_height - curr->y_offset - curr->height;
    }
    glEnable(GL_SCISSOR_TEST);
-    // Only updated the requested area, not the whole canvas.
+    // Only update the requested area, not the whole canvas.
-    glScissor(window_x, window_y,
+    glScissor(window_x, window_y, prev->width, prev->height);
              kParams.prev_frame.width, kParams.prev_frame.height);
    glClear(GL_COLOR_BUFFER_BIT);  // use clear color
    DrawCheckerBoard();
    glDisable(GL_SCISSOR_TEST);
  }
-  kParams.prev_frame.width = iter->width;
+
-  kParams.prev_frame.height = iter->height;
+  *prev = *curr;
  kParams.prev_frame.x_offset = iter->x_offset;
  kParams.prev_frame.y_offset = iter->y_offset;
  kParams.prev_frame.dispose_method = iter->dispose_method;
  glDrawPixels(pic->width, pic->height,
               GL_RGBA, GL_UNSIGNED_BYTE,
@ -331,9 +344,9 @@ static void HandleDisplay(void) {
    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
    glRasterPos2f(-0.95f, 0.80f);
    PrintString(tmp);
-    if (iter->x_offset != 0 || iter->y_offset != 0) {
+    if (curr->x_offset != 0 || curr->y_offset != 0) {
      snprintf(tmp, sizeof(tmp), " (offset:%d,%d)",
-               iter->x_offset, iter->y_offset);
+               curr->x_offset, curr->y_offset);
      glRasterPos2f(-0.95f, 0.70f);
      PrintString(tmp);
    }
@ -369,42 +382,58 @@ static void Help(void) {
  printf("Usage: vwebp in_file [options]\n\n"
         "Decodes the WebP image file and visualize it using OpenGL\n"
         "Options are:\n"
-         "  -version  .... print version number and exit.\n"
+         "  -version  .... print version number and exit\n"
-         "  -noicc ....... don't use the icc profile if present.\n"
+         "  -noicc ....... don't use the icc profile if present\n"
-         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
+         "  -nofancy ..... don't use the fancy YUV420 upscaler\n"
-         "  -nofilter .... disable in-loop filtering.\n"
+         "  -nofilter .... disable in-loop filtering\n"
-         "  -mt .......... use multi-threading.\n"
+         "  -dither <int>  dithering strength (0..100), default=50\n"
-         "  -info ........ print info.\n"
+#if WEBP_DECODER_ABI_VERSION > 0x0204
-         "  -h     ....... this help message.\n"
+         "  -noalphadither disable alpha plane dithering\n"
 #endif
         "  -mt .......... use multi-threading\n"
         "  -info ........ print info\n"
         "  -h     ....... this help message\n"
         "\n"
         "Keyboard shortcuts:\n"
-         "  'c' ................ toggle use of color profile.\n"
+         "  'c' ................ toggle use of color profile\n"
-         "  'i' ................ overlay file information.\n"
+         "  'i' ................ overlay file information\n"
-         "  'q' / 'Q' / ESC .... quit.\n"
+         "  'q' / 'Q' / ESC .... quit\n"
        );
 }
 int main(int argc, char *argv[]) {
  WebPDecoderConfig config;
  int c;
  WebPDecoderConfig* const config = &kParams.config;
  WebPIterator* const curr = &kParams.curr_frame;
-  if (!WebPInitDecoderConfig(&config)) {
+  if (!WebPInitDecoderConfig(config)) {
    fprintf(stderr, "Library version mismatch!\n");
    return -1;
  }
-  kParams.config = &config;
+  config->options.dithering_strength = 50;
 #if WEBP_DECODER_ABI_VERSION > 0x0204
  config->options.alpha_dithering_strength = 100;
 #endif
  kParams.use_color_profile = 1;
  for (c = 1; c < argc; ++c) {
    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
      return 0;
    } else if (!strcmp(argv[c], "-noicc")) {
      kParams.use_color_profile = 0;
    } else if (!strcmp(argv[c], "-nofancy")) {
-      config.options.no_fancy_upsampling = 1;
+      config->options.no_fancy_upsampling = 1;
    } else if (!strcmp(argv[c], "-nofilter")) {
-      config.options.bypass_filtering = 1;
+      config->options.bypass_filtering = 1;
 #if WEBP_DECODER_ABI_VERSION > 0x0204
    } else if (!strcmp(argv[c], "-noalphadither")) {
      config->options.alpha_dithering_strength = 0;
 #endif
    } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) {
      config->options.dithering_strength =
          ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-info")) {
      kParams.print_info = 1;
    } else if (!strcmp(argv[c], "-version")) {
@ -416,7 +445,10 @@ int main(int argc, char *argv[]) {
             (dmux_version >> 8) & 0xff, dmux_version & 0xff);
      return 0;
    } else if (!strcmp(argv[c], "-mt")) {
-      config.options.use_threads = 1;
+      config->options.use_threads = 1;
    } else if (!strcmp(argv[c], "--")) {
      if (c < argc - 1) kParams.file_name = argv[++c];
      break;
    } else if (argv[c][0] == '-') {
      printf("Unknown option '%s'\n", argv[c]);
      Help();
@ -424,6 +456,11 @@ int main(int argc, char *argv[]) {
    } else {
      kParams.file_name = argv[c];
    }
    if (parse_error) {
      Help();
      return -1;
    }
  }
  if (kParams.file_name == NULL) {
@ -437,6 +474,11 @@ int main(int argc, char *argv[]) {
    goto Error;
  }
  if (!WebPGetInfo(kParams.data.bytes, kParams.data.size, NULL, NULL)) {
    fprintf(stderr, "Input file doesn't appear to be WebP format.\n");
    goto Error;
  }
  kParams.dmux = WebPDemux(&kParams.data);
  if (kParams.dmux == NULL) {
    fprintf(stderr, "Could not create demuxing object!\n");
@ -453,10 +495,7 @@ int main(int argc, char *argv[]) {
    printf("Canvas: %d x %d\n", kParams.canvas_width, kParams.canvas_height);
  }
-  kParams.prev_frame.width = kParams.canvas_width;
+  ClearPreviousFrame();
  kParams.prev_frame.height = kParams.canvas_height;
  kParams.prev_frame.x_offset = kParams.prev_frame.y_offset = 0;
  kParams.prev_frame.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
  memset(&kParams.iccp, 0, sizeof(kParams.iccp));
  kParams.has_color_profile =
@ -472,20 +511,20 @@ int main(int argc, char *argv[]) {
 #endif
  }
-  if (!WebPDemuxGetFrame(kParams.dmux, 1, &kParams.frameiter)) goto Error;
+  if (!WebPDemuxGetFrame(kParams.dmux, 1, curr)) goto Error;
-  kParams.has_animation = (kParams.frameiter.num_frames > 1);
+  kParams.has_animation = (curr->num_frames > 1);
  kParams.loop_count = (int)WebPDemuxGetI(kParams.dmux, WEBP_FF_LOOP_COUNT);
  kParams.bg_color = WebPDemuxGetI(kParams.dmux, WEBP_FF_BACKGROUND_COLOR);
  printf("VP8X: Found %d images in file (loop count = %d)\n",
-         kParams.frameiter.num_frames, kParams.loop_count);
+         curr->num_frames, kParams.loop_count);
  // Decode first frame
  if (!Decode()) goto Error;
  // Position iterator to last frame. Next call to HandleDisplay will wrap over.
  // We take this into account by bumping up loop_count.
-  WebPDemuxGetFrame(kParams.dmux, 0, &kParams.frameiter);
+  WebPDemuxGetFrame(kParams.dmux, 0, curr);
  if (kParams.loop_count) ++kParams.loop_count;
  // Start display (and timer)
@ -507,4 +546,14 @@ int main(int argc, char *argv[]) {
  return -1;
 }
 #else   // !WEBP_HAVE_GL
 // Stub entry point for the '#else  // !WEBP_HAVE_GL' branch: it only reports
 // on stderr that OpenGL support is not enabled (naming the program through
 // argv[0]) and then returns 0.
 int main(int argc, const char *argv[]) {
  fprintf(stderr, "OpenGL support not enabled in %s.\n", argv[0]);
  (void)argc;  // 'argc' is intentionally unused in this stub.
  return 0;
 }
 #endif
 //------------------------------------------------------------------------------
--- a/examples/webpdec.c
+++ b/examples/webpdec.c
@ -0,0 +1,67 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // WebP decode.
 #include "./webpdec.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include "webp/decode.h"
 #include "webp/encode.h"
 #include "./example_util.h"
 #include "./metadata.h"
 // Loads the WebP file 'in_file', decodes it to RGB(A) samples and imports the
 // result into 'pic'.
 //   in_file:    path handed to ExUtilLoadWebP() and used in error reports.
 //   pic:        destination picture; its width, height and use_argb fields are
 //               overwritten once decoding succeeds.
 //   keep_alpha: when non-zero and the bitstream reports alpha, the image is
 //               decoded as RGBA; otherwise it is decoded as RGB.
 //   metadata:   not filled in; a non-NULL value only triggers a warning.
 // Returns the result of WebPPictureImportRGBA()/WebPPictureImportRGB() when
 // decoding succeeds, and 0 on any earlier failure.
 int ReadWebP(const char* const in_file, WebPPicture* const pic,
             int keep_alpha, Metadata* const metadata) {
  int ok = 0;
  size_t data_size = 0;
  const uint8_t* data = NULL;
  VP8StatusCode status = VP8_STATUS_OK;
  WebPDecoderConfig config;
  // Convenience aliases into 'config' for the decoded output buffer and the
  // bitstream features.
  WebPDecBuffer* const output_buffer = &config.output;
  WebPBitstreamFeatures* const bitstream = &config.input;
  // TODO(jzern): add Exif/XMP/ICC extraction.
  if (metadata != NULL) {
    fprintf(stderr, "Warning: metadata extraction from WebP is unsupported.\n");
  }
  // Early return: nothing has been loaded or decoded yet, so the cleanup at
  // the bottom of the function is skipped on this path.
  if (!WebPInitDecoderConfig(&config)) {
    fprintf(stderr, "Library version mismatch!\n");
    return 0;
  }
  // NOTE(review): ExUtilLoadWebP() presumably reads the file into 'data' and
  // fills 'bitstream' -- confirm against example_util.h. If it fails, 'status'
  // is still VP8_STATUS_OK, so no error is printed by this function.
  if (ExUtilLoadWebP(in_file, &data, &data_size, bitstream)) {
    const int has_alpha = keep_alpha && bitstream->has_alpha;
    output_buffer->colorspace = has_alpha ? MODE_RGBA : MODE_RGB;
    // NOTE(review): the meaning of the third argument (0) is not visible here;
    // confirm against the ExUtilDecodeWebP() declaration.
    status = ExUtilDecodeWebP(data, data_size, 0, &config);
    if (status == VP8_STATUS_OK) {
      const uint8_t* const rgba = output_buffer->u.RGBA.rgba;
      const int stride = output_buffer->u.RGBA.stride;
      pic->width = output_buffer->width;
      pic->height = output_buffer->height;
      pic->use_argb = 1;
      // Use the importer that matches the colorspace selected above.
      ok = has_alpha ? WebPPictureImportRGBA(pic, rgba, stride)
                     : WebPPictureImportRGB(pic, rgba, stride);
    }
  }
  if (status != VP8_STATUS_OK) {
    ExUtilPrintWebPError(in_file, status);
  }
  // Cleanup shared by the success and failure paths. The cast drops the const
  // qualifier so the pointer can be passed to free(); 'data' is still NULL if
  // nothing was stored in it.
  free((void*)data);
  WebPFreeDecBuffer(output_buffer);
  return ok;
 }
 // -----------------------------------------------------------------------------
--- a/examples/webpdec.h
+++ b/examples/webpdec.h
@ -0,0 +1,33 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // WebP decode.
 #ifndef WEBP_EXAMPLES_WEBPDEC_H_
 #define WEBP_EXAMPLES_WEBPDEC_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 struct Metadata;
 struct WebPPicture;
 // Reads a WebP from 'in_file', returning the decoded output in 'pic'.
 // If 'keep_alpha' is true and the WebP has an alpha channel, the output is
 // RGBA; otherwise it will be RGB.
 // Returns true on success.
 int ReadWebP(const char* const in_file, struct WebPPicture* const pic,
             int keep_alpha, struct Metadata* const metadata);
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  // WEBP_EXAMPLES_WEBPDEC_H_
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
@ -1,16 +1,15 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Simple command-line to create a WebP container file and to extract or strip
 //  relevant data from the container file.
 //
 //  Compile with:     gcc -o webpmux webpmux.c -lwebpmux -lwebp
 //
 //
 // Authors: Vikas (vikaas.arora@gmail.com),
 //          Urvang (urvang@google.com)
@ -47,13 +46,14 @@
 */
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "webp/config.h"
 #endif
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "webp/decode.h"
 #include "webp/mux.h"
 #include "./example_util.h"
@ -130,7 +130,7 @@ static int CountOccurrences(const char* arglist[], int list_length,
  return num_occurences;
 }
-static const char* const kErrorMessages[] = {
+static const char* const kErrorMessages[-WEBP_MUX_NOT_ENOUGH_DATA + 1] = {
  "WEBP_MUX_NOT_FOUND", "WEBP_MUX_INVALID_ARGUMENT", "WEBP_MUX_BAD_DATA",
  "WEBP_MUX_MEMORY_ERROR", "WEBP_MUX_NOT_ENOUGH_DATA"
 };
@ -146,12 +146,6 @@ static const char* ErrorString(WebPMuxError err) {
    return err;                                                      \
  }
 #define RETURN_IF_ERROR2(ERR_MSG, FORMAT_STR)                        \
  if (err != WEBP_MUX_OK) {                                          \
    fprintf(stderr, ERR_MSG, FORMAT_STR);                            \
    return err;                                                      \
  }
 #define RETURN_IF_ERROR3(ERR_MSG, FORMAT_STR1, FORMAT_STR2)          \
  if (err != WEBP_MUX_OK) {                                          \
    fprintf(stderr, ERR_MSG, FORMAT_STR1, FORMAT_STR2);              \
@ -180,9 +174,14 @@ static const char* ErrorString(WebPMuxError err) {
  } while (0)
 static WebPMuxError DisplayInfo(const WebPMux* mux) {
  int width, height;
  uint32_t flag;
-  WebPMuxError err = WebPMuxGetFeatures(mux, &flag);
+  WebPMuxError err = WebPMuxGetCanvasSize(mux, &width, &height);
  assert(err == WEBP_MUX_OK);  // As WebPMuxCreate() was successful earlier.
  printf("Canvas size: %d x %d\n", width, height);
  err = WebPMuxGetFeatures(mux, &flag);
 #ifndef WEBP_EXPERIMENTAL_FEATURES
  if (flag & FRAGMENTS_FLAG) err = WEBP_MUX_INVALID_ARGUMENT;
 #endif
@ -212,28 +211,44 @@ static WebPMuxError DisplayInfo(const WebPMux* mux) {
    if (is_anim) {
      WebPMuxAnimParams params;
      err = WebPMuxGetAnimationParams(mux, &params);
-      RETURN_IF_ERROR("Failed to retrieve animation parameters\n");
+      assert(err == WEBP_MUX_OK);
      printf("Background color : 0x%.8X  Loop Count : %d\n",
             params.bgcolor, params.loop_count);
    }
    err = WebPMuxNumChunks(mux, id, &nFrames);
-    RETURN_IF_ERROR2("Failed to retrieve number of %ss\n", type_str);
+    assert(err == WEBP_MUX_OK);
    printf("Number of %ss: %d\n", type_str, nFrames);
    if (nFrames > 0) {
      int i;
-      printf("No.: x_offset y_offset ");
+      printf("No.: width height alpha x_offset y_offset ");
-      if (is_anim) printf("duration dispose ");
+      if (is_anim) printf("duration   dispose blend ");
      printf("image_size\n");
      for (i = 1; i <= nFrames; i++) {
        WebPMuxFrameInfo frame;
        err = WebPMuxGetFrame(mux, i, &frame);
-        RETURN_IF_ERROR3("Failed to retrieve %s#%d\n", type_str, i);
+        if (err == WEBP_MUX_OK) {
-        printf("%3d: %8d %8d ", i, frame.x_offset, frame.y_offset);
+          WebPBitstreamFeatures features;
-        if (is_anim) printf("%8d %7d ", frame.duration, frame.dispose_method);
+          const VP8StatusCode status = WebPGetFeatures(
              frame.bitstream.bytes, frame.bitstream.size, &features);
          assert(status == VP8_STATUS_OK);  // Checked by WebPMuxCreate().
          (void)status;
          printf("%3d: %5d %5d %5s %8d %8d ", i, features.width,
                 features.height, features.has_alpha ? "yes" : "no",
                 frame.x_offset, frame.y_offset);
          if (is_anim) {
            const char* const dispose =
                (frame.dispose_method == WEBP_MUX_DISPOSE_NONE) ? "none"
                                                                : "background";
            const char* const blend =
                (frame.blend_method == WEBP_MUX_BLEND) ? "yes" : "no";
            printf("%8d %10s %5s ", frame.duration, dispose, blend);
          }
          printf("%10d\n", (int)frame.bitstream.size);
        }
        WebPDataClear(&frame.bitstream);
        RETURN_IF_ERROR3("Failed to retrieve %s#%d\n", type_str, i);
      }
    }
  }
@ -241,30 +256,33 @@ static WebPMuxError DisplayInfo(const WebPMux* mux) {
  if (flag & ICCP_FLAG) {
    WebPData icc_profile;
    err = WebPMuxGetChunk(mux, "ICCP", &icc_profile);
-    RETURN_IF_ERROR("Failed to retrieve the ICC profile\n");
+    assert(err == WEBP_MUX_OK);
    printf("Size of the ICC profile data: %d\n", (int)icc_profile.size);
  }
  if (flag & EXIF_FLAG) {
    WebPData exif;
    err = WebPMuxGetChunk(mux, "EXIF", &exif);
-    RETURN_IF_ERROR("Failed to retrieve the EXIF metadata\n");
+    assert(err == WEBP_MUX_OK);
    printf("Size of the EXIF metadata: %d\n", (int)exif.size);
  }
  if (flag & XMP_FLAG) {
    WebPData xmp;
    err = WebPMuxGetChunk(mux, "XMP ", &xmp);
-    RETURN_IF_ERROR("Failed to retrieve the XMP metadata\n");
+    assert(err == WEBP_MUX_OK);
    printf("Size of the XMP metadata: %d\n", (int)xmp.size);
  }
  if ((flag & ALPHA_FLAG) && !(flag & (ANIMATION_FLAG | FRAGMENTS_FLAG))) {
    WebPMuxFrameInfo image;
    err = WebPMuxGetFrame(mux, 1, &image);
-    RETURN_IF_ERROR("Failed to retrieve the image\n");
+    if (err == WEBP_MUX_OK) {
      printf("Size of the image (with alpha): %d\n", (int)image.bitstream.size);
    }
    WebPDataClear(&image.bitstream);
    RETURN_IF_ERROR("Failed to retrieve the image\n");
  }
  return WEBP_MUX_OK;
 }
@ -285,50 +303,52 @@ static void PrintHelp(void) {
  printf("\n");
  printf("GET_OPTIONS:\n");
-  printf(" Extract relevant data.\n");
+  printf(" Extract relevant data:\n");
-  printf("   icc       Get ICC profile.\n");
+  printf("   icc       get ICC profile\n");
-  printf("   exif      Get EXIF metadata.\n");
+  printf("   exif      get EXIF metadata\n");
-  printf("   xmp       Get XMP metadata.\n");
+  printf("   xmp       get XMP metadata\n");
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-  printf("   frgm n    Get nth fragment.\n");
+  printf("   frgm n    get nth fragment\n");
 #endif
-  printf("   frame n   Get nth frame.\n");
+  printf("   frame n   get nth frame\n");
  printf("\n");
  printf("SET_OPTIONS:\n");
-  printf(" Set color profile/metadata.\n");
+  printf(" Set color profile/metadata:\n");
-  printf("   icc  file.icc     Set ICC profile.\n");
+  printf("   icc  file.icc     set ICC profile\n");
-  printf("   exif file.exif    Set EXIF metadata.\n");
+  printf("   exif file.exif    set EXIF metadata\n");
-  printf("   xmp  file.xmp     Set XMP metadata.\n");
+  printf("   xmp  file.xmp     set XMP metadata\n");
  printf("   where:    'file.icc' contains the ICC profile to be set,\n");
  printf("             'file.exif' contains the EXIF metadata to be set\n");
  printf("             'file.xmp' contains the XMP metadata to be set\n");
  printf("\n");
  printf("STRIP_OPTIONS:\n");
-  printf(" Strip color profile/metadata.\n");
+  printf(" Strip color profile/metadata:\n");
-  printf("   icc       Strip ICC profile.\n");
+  printf("   icc       strip ICC profile\n");
-  printf("   exif      Strip EXIF metadata.\n");
+  printf("   exif      strip EXIF metadata\n");
-  printf("   xmp       Strip XMP metadata.\n");
+  printf("   xmp       strip XMP metadata\n");
 #ifdef WEBP_EXPERIMENTAL_FEATURES
  printf("\n");
  printf("FRAGMENT_OPTIONS(i):\n");
-  printf(" Create fragmented image.\n");
+  printf(" Create fragmented image:\n");
  printf("   file_i +xi+yi\n");
  printf("   where:    'file_i' is the i'th fragment (WebP format),\n");
-  printf("             'xi','yi' specify the image offset for this fragment."
+  printf("             'xi','yi' specify the image offset for this fragment"
         "\n");
 #endif
  printf("\n");
  printf("FRAME_OPTIONS(i):\n");
-  printf(" Create animation.\n");
+  printf(" Create animation:\n");
-  printf("   file_i +di+xi+yi+mi\n");
+  printf("   file_i +di+[xi+yi[+mi[bi]]]\n");
  printf("   where:    'file_i' is the i'th animation frame (WebP format),\n");
-  printf("             'di' is the pause duration before next frame.\n");
+  printf("             'di' is the pause duration before next frame,\n");
-  printf("             'xi','yi' specify the image offset for this frame.\n");
+  printf("             'xi','yi' specify the image offset for this frame,\n");
-  printf("             'mi' is the dispose method for this frame (0 or 1).\n");
+  printf("             'mi' is the dispose method for this frame (0 or 1),\n");
  printf("             'bi' is the blending method for this frame (+b or -b)"
         "\n");
  printf("\n");
  printf("LOOP_COUNT:\n");
@ -343,7 +363,7 @@ static void PrintHelp(void) {
         "specifying\n");
  printf("            the Alpha, Red, Green and Blue component values "
         "respectively\n");
-  printf("            [Default: 255,255,255,255].\n");
+  printf("            [Default: 255,255,255,255]\n");
  printf("\nINPUT & OUTPUT are in WebP format.\n");
@ -351,6 +371,14 @@ static void PrintHelp(void) {
  printf(" and is assumed to be\nvalid.\n");
 }
 // Prints a warning on stderr when the x or y offset in 'info' is odd, showing
 // the given offsets next to the same values with their lowest bit cleared.
 // 'info' is only read here; the snapping itself is not done by this function.
 static void WarnAboutOddOffset(const WebPMuxFrameInfo* const info) {
  // OR-ing the two offsets lets a single test catch an odd value in either.
  if ((info->x_offset | info->y_offset) & 1) {
    fprintf(stderr, "Warning: odd offsets will be snapped to even values"
            " (%d, %d) -> (%d, %d)\n", info->x_offset, info->y_offset,
            info->x_offset & ~1, info->y_offset & ~1);
  }
 }
 static int ReadFileToWebPData(const char* const filename,
                              WebPData* const webp_data) {
  const uint8_t* data;
@ -374,8 +402,9 @@ static int CreateMux(const char* const filename, WebPMux** mux) {
 static int WriteData(const char* filename, const WebPData* const webpdata) {
  int ok = 0;
-  FILE* fout = strcmp(filename, "-") ? fopen(filename, "wb") : stdout;
+  FILE* fout = strcmp(filename, "-") ? fopen(filename, "wb")
-  if (!fout) {
+                                     : ExUtilSetBinaryMode(stdout);
  if (fout == NULL) {
    fprintf(stderr, "Error opening output WebP file %s!\n", filename);
    return 0;
  }
@ -405,27 +434,44 @@ static int WriteWebP(WebPMux* const mux, const char* filename) {
 static int ParseFrameArgs(const char* args, WebPMuxFrameInfo* const info) {
  int dispose_method, dummy;
-  const int num_args = sscanf(args, "+%d+%d+%d+%d+%d",
+  char plus_minus, blend_method;
-                              &info->duration, &info->x_offset, &info->y_offset,
+  const int num_args = sscanf(args, "+%d+%d+%d+%d%c%c+%d", &info->duration,
-                              &dispose_method, &dummy);
+                              &info->x_offset, &info->y_offset, &dispose_method,
                              &plus_minus, &blend_method, &dummy);
  switch (num_args) {
    case 1:
      info->x_offset = info->y_offset = 0;  // fall through
    case 3:
      dispose_method = 0;  // fall through
    case 4:
      plus_minus = '+';
      blend_method = 'b';  // fall through
    case 6:
      break;
    case 2:
    case 5:
    default:
      return 0;
  }
  WarnAboutOddOffset(info);
  // Note: The sanity of the following conversion is checked by
-  // WebPMuxSetAnimationParams().
+  // WebPMuxPushFrame().
  info->dispose_method = (WebPMuxAnimDispose)dispose_method;
  if (blend_method != 'b') return 0;
  if (plus_minus != '-' && plus_minus != '+') return 0;
  info->blend_method =
      (plus_minus == '+') ? WEBP_MUX_BLEND : WEBP_MUX_NO_BLEND;
  return 1;
 }
 static int ParseFragmentArgs(const char* args, WebPMuxFrameInfo* const info) {
-  return (sscanf(args, "+%d+%d", &info->x_offset, &info->y_offset) == 2);
+  const int ok =
      (sscanf(args, "+%d+%d", &info->x_offset, &info->y_offset) == 2);
  if (ok) WarnAboutOddOffset(info);
  return ok;
 }
 static int ParseBgcolorArgs(const char* args, uint32_t* const bgcolor) {
@ -442,7 +488,7 @@ static int ParseBgcolorArgs(const char* args, uint32_t* const bgcolor) {
 static void DeleteConfig(WebPMuxConfig* config) {
  if (config != NULL) {
    free(config->feature_.args_);
-    free(config);
+    memset(config, 0, sizeof(*config));
  }
 }
@ -646,6 +692,17 @@ static int ParseCommandLine(int argc, const char* argv[],
               (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
        DeleteConfig(config);
        exit(0);
      } else if (!strcmp(argv[i], "--")) {
        if (i < argc - 1) {
          ++i;
          if (config->input_ == NULL) {
            config->input_ = argv[i];
          } else {
            ERROR_GOTO2("ERROR at '%s': Multiple input files specified.\n",
                        argv[i], ErrParse);
          }
        }
        break;
      } else {
        ERROR_GOTO2("ERROR: Unknown option: '%s'.\n", argv[i], ErrParse);
      }
@ -734,33 +791,27 @@ static int ValidateConfig(WebPMuxConfig* config) {
 // Create config object from command-line arguments.
 static int InitializeConfig(int argc, const char* argv[],
-                            WebPMuxConfig** config) {
+                            WebPMuxConfig* config) {
  int num_feature_args = 0;
  int ok = 1;
  assert(config != NULL);
-  *config = NULL;
+  memset(config, 0, sizeof(*config));
  // Validate command-line arguments.
  if (!ValidateCommandLine(argc, argv, &num_feature_args)) {
    ERROR_GOTO1("Exiting due to command-line parsing error.\n", Err1);
  }
-  // Allocate memory.
+  config->feature_.arg_count_ = num_feature_args;
-  *config = (WebPMuxConfig*)calloc(1, sizeof(**config));
+  config->feature_.args_ =
-  if (*config == NULL) {
+      (FeatureArg*)calloc(num_feature_args, sizeof(*config->feature_.args_));
-    ERROR_GOTO1("ERROR: Memory allocation error.\n", Err1);
+  if (config->feature_.args_ == NULL) {
  }
  (*config)->feature_.arg_count_ = num_feature_args;
  (*config)->feature_.args_ =
      (FeatureArg*)calloc(num_feature_args, sizeof(FeatureArg));
  if ((*config)->feature_.args_ == NULL) {
    ERROR_GOTO1("ERROR: Memory allocation error.\n", Err1);
  }
  // Parse command-line.
-  if (!ParseCommandLine(argc, argv, *config) ||
+  if (!ParseCommandLine(argc, argv, config) || !ValidateConfig(config)) {
      !ValidateConfig(*config)) {
    ERROR_GOTO1("Exiting due to command-line parsing error.\n", Err1);
  }
@ -782,14 +833,16 @@ static int GetFrameFragment(const WebPMux* mux,
  WebPMux* mux_single = NULL;
  long num = 0;
  int ok = 1;
  int parse_error = 0;
  const WebPChunkId id = is_frame ? WEBP_CHUNK_ANMF : WEBP_CHUNK_FRGM;
  WebPMuxFrameInfo info;
  WebPDataInit(&info.bitstream);
-  num = strtol(config->feature_.args_[0].params_, NULL, 10);
+  num = ExUtilGetInt(config->feature_.args_[0].params_, 10, &parse_error);
  if (num < 0) {
    ERROR_GOTO1("ERROR: Frame/Fragment index must be non-negative.\n", ErrGet);
  }
  if (parse_error) goto ErrGet;
  err = WebPMuxGetFrame(mux, num, &info);
  if (err == WEBP_MUX_OK && info.id != id) err = WEBP_MUX_NOT_FOUND;
@ -815,7 +868,7 @@ static int GetFrameFragment(const WebPMux* mux,
 ErrGet:
  WebPDataClear(&info.bitstream);
  WebPMuxDelete(mux_single);
-  return ok;
+  return ok && !parse_error;
 }
 // Read and process config.
@ -877,16 +930,19 @@ static int Process(const WebPMuxConfig* config) {
                break;
              }
              case SUBTYPE_LOOP: {
-                const long loop_count =
+                int parse_error = 0;
-                    strtol(feature->args_[i].params_, NULL, 10);
+                const int loop_count =
-                if (loop_count != (int)loop_count) {
+                    ExUtilGetInt(feature->args_[i].params_, 10, &parse_error);
                if (loop_count < 0 || loop_count > 65535) {
                  // Note: This is only a 'necessary' condition for loop_count
                  // to be valid. The 'sufficient' condition is checked in the
                  // WebPMuxSetAnimationParams() method called later.
                  ERROR_GOTO1("ERROR: Loop count must be in the range 0 to "
                              "65535.\n", Err2);
                }
-                params.loop_count = (int)loop_count;
+                ok = !parse_error;
                if (!ok) goto Err2;
                params.loop_count = loop_count;
                break;
              }
              case SUBTYPE_ANMF: {
@ -1013,14 +1069,14 @@ static int Process(const WebPMuxConfig* config) {
 // Main.
 int main(int argc, const char* argv[]) {
-  WebPMuxConfig* config;
+  WebPMuxConfig config;
  int ok = InitializeConfig(argc - 1, argv + 1, &config);
  if (ok) {
-    ok = Process(config);
+    ok = Process(&config);
  } else {
    PrintHelp();
  }
-  DeleteConfig(config);
+  DeleteConfig(&config);
  return !ok;
 }
--- a/examples/wicdec.c
+++ b/examples/wicdec.c
@ -1,8 +1,10 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Windows Imaging Component (WIC) decode.
@ -10,9 +12,10 @@
 #include "./wicdec.h"
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "webp/config.h"
 #endif
 #include <assert.h>
 #include <stdio.h>
 #ifdef HAVE_WINCODEC_H
@ -107,6 +110,7 @@ static HRESULT ExtractICCP(IWICImagingFactory* const factory,
    IFS(IWICBitmapFrameDecode_GetColorContexts(frame,
                                               count, color_contexts,
                                               &num_color_contexts));
    assert(FAILED(hr) || num_color_contexts <= count);
    for (i = 0; SUCCEEDED(hr) && i < num_color_contexts; ++i) {
      WICColorContextType type;
      IFS(IWICColorContext_GetType(color_contexts[i], &type));
@ -114,7 +118,7 @@ static HRESULT ExtractICCP(IWICImagingFactory* const factory,
        UINT size;
        IFS(IWICColorContext_GetProfileBytes(color_contexts[i],
                                             0, NULL, &size));
-        if (size > 0) {
+        if (SUCCEEDED(hr) && size > 0) {
          iccp->bytes = (uint8_t*)malloc(size);
          if (iccp->bytes == NULL) {
            hr = E_OUTOFMEMORY;
@ -126,7 +130,7 @@ static HRESULT ExtractICCP(IWICImagingFactory* const factory,
                                               &size));
          if (SUCCEEDED(hr) && size != iccp->size) {
            fprintf(stderr, "Warning! ICC profile size (%u) != expected (%u)\n",
-                    size, iccp->size);
+                    size, (uint32_t)iccp->size);
            iccp->size = size;
          }
          break;
@ -259,7 +263,7 @@ int ReadPictureWithWIC(const char* const filename,
  IFS(IWICBitmapFrameDecode_GetPixelFormat(frame, &src_pixel_format));
  IFS(IWICBitmapDecoder_GetContainerFormat(decoder, &src_container_format));
-  if (keep_alpha) {
+  if (SUCCEEDED(hr) && keep_alpha) {
    const GUID** guid;
    for (guid = kAlphaContainers; *guid != NULL; ++guid) {
      if (IsEqualGUID(MAKE_REFGUID(src_container_format),
@ -306,6 +310,7 @@ int ReadPictureWithWIC(const char* const filename,
    int ok;
    pic->width = width;
    pic->height = height;
    pic->use_argb = 1;
    ok = importer->import(pic, rgb, stride);
    if (!ok) hr = E_FAIL;
  }
--- a/examples/wicdec.h
+++ b/examples/wicdec.h
@ -1,8 +1,10 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Windows Imaging Component (WIC) decode.
@ -10,7 +12,7 @@
 #ifndef WEBP_EXAMPLES_WICDEC_H_
 #define WEBP_EXAMPLES_WICDEC_H_
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
@ -25,7 +27,7 @@ int ReadPictureWithWIC(const char* const filename,
                       struct WebPPicture* const pic, int keep_alpha,
                       struct Metadata* const metadata);
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -12,28 +12,37 @@
 set -e
 # Extract the latest SDK version from the final field of the form: iphoneosX.Y
-declare -r SDK=$(xcodebuild -showsdks \
+readonly SDK=$(xcodebuild -showsdks \
  | grep iphoneos | sort | tail -n 1 | awk '{print substr($NF, 9)}'
 )
-declare -r OLDPATH=${PATH}
+# Extract Xcode version.
 readonly XCODE=$(xcodebuild -version | grep Xcode | cut -d " " -f2)
 if [[ -z "${XCODE}" ]]; then
  echo "Xcode not available"
  exit 1
 fi
 readonly OLDPATH=${PATH}
 # Add iPhoneOS-V6 to the list of platforms below if you need armv6 support.
 # Note that iPhoneOS-V6 support is not available with the iOS6 SDK.
-declare -r PLATFORMS="iPhoneSimulator iPhoneOS-V7 iPhoneOS-V7s"
+PLATFORMS="iPhoneSimulator iPhoneSimulator64"
-declare -r SRCDIR=$(dirname $0)
+PLATFORMS+=" iPhoneOS-V7 iPhoneOS-V7s iPhoneOS-V7-arm64"
-declare -r TOPDIR=$(pwd)
+readonly PLATFORMS
-declare -r BUILDDIR="${TOPDIR}/iosbuild"
+readonly SRCDIR=$(dirname $0)
-declare -r TARGETDIR="${TOPDIR}/WebP.framework"
+readonly TOPDIR=$(pwd)
-declare -r DEVELOPER=$(xcode-select --print-path)
+readonly BUILDDIR="${TOPDIR}/iosbuild"
-declare -r PLATFORMSROOT="${DEVELOPER}/Platforms"
+readonly TARGETDIR="${TOPDIR}/WebP.framework"
-declare -r LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
+readonly DEVELOPER=$(xcode-select --print-path)
 readonly PLATFORMSROOT="${DEVELOPER}/Platforms"
 readonly LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 LIBLIST=''
 if [[ -z "${SDK}" ]]; then
  echo "iOS SDK not available"
  exit 1
-elif [[ ${SDK} < 4.0 ]]; then
+elif [[ ${SDK} < 6.0 ]]; then
-  echo "You need iOS SDK version 4.0 or above"
+  echo "You need iOS SDK version 6.0 or above"
  exit 1
 else
  echo "iOS SDK Version ${SDK}"
@ -44,10 +53,25 @@ rm -rf ${TARGETDIR}
 mkdir -p ${BUILDDIR}
 mkdir -p ${TARGETDIR}/Headers/
-[[ -e ${SRCDIR}/configure ]] || (cd ${SRCDIR} && sh autogen.sh)
+if [[ ! -e ${SRCDIR}/configure ]]; then
  if ! (cd ${SRCDIR} && sh autogen.sh); then
    cat <<EOT
 Error creating configure script!
 This script requires the autoconf/automake and libtool to build. MacPorts can
 be used to obtain these:
 http://www.macports.org/install.php
 EOT
    exit 1
  fi
 fi
 for PLATFORM in ${PLATFORMS}; do
-  if [[ "${PLATFORM}" == "iPhoneOS-V7s" ]]; then
+  ARCH2=""
  if [[ "${PLATFORM}" == "iPhoneOS-V7-arm64" ]]; then
    PLATFORM="iPhoneOS"
    ARCH="aarch64"
    ARCH2="arm64"
  elif [[ "${PLATFORM}" == "iPhoneOS-V7s" ]]; then
    PLATFORM="iPhoneOS"
    ARCH="armv7s"
  elif [[ "${PLATFORM}" == "iPhoneOS-V7" ]]; then
@ -56,6 +80,9 @@ for PLATFORM in ${PLATFORMS}; do
  elif [[ "${PLATFORM}" == "iPhoneOS-V6" ]]; then
    PLATFORM="iPhoneOS"
    ARCH="armv6"
  elif [[ "${PLATFORM}" == "iPhoneSimulator64" ]]; then
    PLATFORM="iPhoneSimulator"
    ARCH="x86_64"
  else
    ARCH="i386"
  fi
@ -63,18 +90,20 @@ for PLATFORM in ${PLATFORMS}; do
  ROOTDIR="${BUILDDIR}/${PLATFORM}-${SDK}-${ARCH}"
  mkdir -p "${ROOTDIR}"
-  export DEVROOT="${PLATFORMSROOT}/${PLATFORM}.platform/Developer"
+  DEVROOT="${DEVELOPER}/Toolchains/XcodeDefault.xctoolchain"
-  export SDKROOT="${DEVROOT}/SDKs/${PLATFORM}${SDK}.sdk"
+  SDKROOT="${PLATFORMSROOT}/"
  SDKROOT+="${PLATFORM}.platform/Developer/SDKs/${PLATFORM}${SDK}.sdk/"
  CFLAGS="-arch ${ARCH2:-${ARCH}} -pipe -isysroot ${SDKROOT} -O3 -DNDEBUG"
  CFLAGS+=" -miphoneos-version-min=6.0"
-  export CFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
+  set -x
  export CXXFLAGS=${CFLAGS}
  export LDFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
  export PATH="${DEVROOT}/usr/bin:${OLDPATH}"
  ${SRCDIR}/configure --host=${ARCH}-apple-darwin --prefix=${ROOTDIR} \
    --build=$(${SRCDIR}/config.guess) \
    --disable-shared --enable-static \
-    --enable-libwebpdecoder --enable-swap-16bit-csp
+    --enable-libwebpdecoder --enable-swap-16bit-csp \
    CFLAGS="${CFLAGS}"
  set +x
  # run make only in the src/ directory to create libwebpdecoder.a
  cd src/
@ -89,5 +118,5 @@ for PLATFORM in ${PLATFORMS}; do
  export PATH=${OLDPATH}
 done
-cp -a ${SRCDIR}/src/webp/* ${TARGETDIR}/Headers/
+cp -a ${SRCDIR}/src/webp/*.h ${TARGETDIR}/Headers/
 ${LIPO} -create ${LIBLIST} -output ${TARGETDIR}/WebP
--- a/m4/ax_pthread.m4
+++ b/m4/ax_pthread.m4
@ -82,7 +82,7 @@
 #   modified version of the Autoconf Macro, you may extend this special
 #   exception to the GPL to apply to your modified version as well.
-#serial 18
+#serial 21
 AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
 AC_DEFUN([AX_PTHREAD], [
@ -103,8 +103,8 @@ if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
        save_LIBS="$LIBS"
        LIBS="$PTHREAD_LIBS $LIBS"
        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
-        AC_TRY_LINK_FUNC(pthread_join, ax_pthread_ok=yes)
+        AC_TRY_LINK_FUNC([pthread_join], [ax_pthread_ok=yes])
-        AC_MSG_RESULT($ax_pthread_ok)
+        AC_MSG_RESULT([$ax_pthread_ok])
        if test x"$ax_pthread_ok" = xno; then
                PTHREAD_LIBS=""
                PTHREAD_CFLAGS=""
@ -164,6 +164,20 @@ case ${host_os} in
        ;;
 esac
 # Clang doesn't consider unrecognized options an error unless we specify
 # -Werror. We throw in some extra Clang-specific options to ensure that
 # this doesn't happen for GCC, which also accepts -Werror.
 AC_MSG_CHECKING([if compiler needs -Werror to reject unknown flags])
 save_CFLAGS="$CFLAGS"
 ax_pthread_extra_flags="-Werror"
 CFLAGS="$CFLAGS $ax_pthread_extra_flags -Wunknown-warning-option -Wsizeof-array-argument"
 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([int foo(void);],[foo()])],
                  [AC_MSG_RESULT([yes])],
                  [ax_pthread_extra_flags=
                   AC_MSG_RESULT([no])])
 CFLAGS="$save_CFLAGS"
 if test x"$ax_pthread_ok" = xno; then
 for flag in $ax_pthread_flags; do
@ -178,7 +192,7 @@ for flag in $ax_pthread_flags; do
                ;;
                pthread-config)
-                AC_CHECK_PROG(ax_pthread_config, pthread-config, yes, no)
+                AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no])
                if test x"$ax_pthread_config" = xno; then continue; fi
                PTHREAD_CFLAGS="`pthread-config --cflags`"
                PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
@ -193,7 +207,7 @@ for flag in $ax_pthread_flags; do
        save_LIBS="$LIBS"
        save_CFLAGS="$CFLAGS"
        LIBS="$PTHREAD_LIBS $LIBS"
-        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS $ax_pthread_extra_flags"
        # Check for various functions.  We must include pthread.h,
        # since some functions may be macros.  (On the Sequent, we
@ -219,7 +233,7 @@ for flag in $ax_pthread_flags; do
        LIBS="$save_LIBS"
        CFLAGS="$save_CFLAGS"
-        AC_MSG_RESULT($ax_pthread_ok)
+        AC_MSG_RESULT([$ax_pthread_ok])
        if test "x$ax_pthread_ok" = xyes; then
                break;
        fi
@ -245,9 +259,9 @@ if test "x$ax_pthread_ok" = xyes; then
                [attr_name=$attr; break],
                [])
        done
-        AC_MSG_RESULT($attr_name)
+        AC_MSG_RESULT([$attr_name])
        if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
-            AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name,
+            AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], [$attr_name],
                               [Define to necessary symbol if this constant
                                uses a non-standard name on your system.])
        fi
@ -261,45 +275,54 @@ if test "x$ax_pthread_ok" = xyes; then
            if test "$GCC" = "yes"; then
                flag="-D_REENTRANT"
            else
                # TODO: What about Clang on Solaris?
                flag="-mt -D_REENTRANT"
            fi
            ;;
        esac
-        AC_MSG_RESULT(${flag})
+        AC_MSG_RESULT([$flag])
        if test "x$flag" != xno; then
            PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
        fi
        AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT],
-            ax_cv_PTHREAD_PRIO_INHERIT, [
+            [ax_cv_PTHREAD_PRIO_INHERIT], [
-                AC_LINK_IFELSE([
+                AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]],
-                    AC_LANG_PROGRAM([[#include <pthread.h>]], [[int i = PTHREAD_PRIO_INHERIT;]])],
+                                                [[int i = PTHREAD_PRIO_INHERIT;]])],
                    [ax_cv_PTHREAD_PRIO_INHERIT=yes],
                    [ax_cv_PTHREAD_PRIO_INHERIT=no])
            ])
        AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes"],
-            AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], 1, [Have PTHREAD_PRIO_INHERIT.]))
+            [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.])])
        LIBS="$save_LIBS"
        CFLAGS="$save_CFLAGS"
-        # More AIX lossage: must compile with xlc_r or cc_r
+        # More AIX lossage: compile with *_r variant
-        if test x"$GCC" != xyes; then
+        if test "x$GCC" != xyes; then
-          AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC})
+            case $host_os in
-        else
+                aix*)
-          PTHREAD_CC=$CC
+                AS_CASE(["x/$CC"],
                  [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6],
                  [#handle absolute path differently from PATH based program lookup
                   AS_CASE(["x$CC"],
                     [x/*],
                     [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])],
                     [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])])
                ;;
            esac
        fi
 else
        PTHREAD_CC="$CC"
 fi
-AC_SUBST(PTHREAD_LIBS)
+test -n "$PTHREAD_CC" || PTHREAD_CC="$CC"
-AC_SUBST(PTHREAD_CFLAGS)
+
-AC_SUBST(PTHREAD_CC)
+AC_SUBST([PTHREAD_LIBS])
 AC_SUBST([PTHREAD_CFLAGS])
 AC_SUBST([PTHREAD_CC])
 # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
 if test x"$ax_pthread_ok" = xyes; then
-        ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1])
+        ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1])
        :
 else
        ax_pthread_ok=no
--- a/makefile.unix
+++ b/makefile.unix
@ -24,6 +24,7 @@ ifeq ($(strip $(shell uname)), Darwin)
  # cf., src/enc/yuv.[hc]
  # Failure observed with: gcc 4.2.1 and 4.0.1.
  EXTRA_FLAGS += -fno-common
  EXTRA_FLAGS += -DHAVE_GLUT_GLUT_H
  EXTRA_FLAGS += -I/opt/local/include
  EXTRA_LIBS  += -L/opt/local/lib
  GL_LIBS = -framework GLUT -framework OpenGL
@ -66,8 +67,20 @@ EXTRA_FLAGS += -Wmissing-prototypes
 EXTRA_FLAGS += -Wmissing-declarations
 EXTRA_FLAGS += -Wdeclaration-after-statement
 EXTRA_FLAGS += -Wshadow
 EXTRA_FLAGS += -Wformat-security -Wformat-nonliteral
 # EXTRA_FLAGS += -Wvla
 # AVX2-specific flags:
 ifeq ($(HAVE_AVX2), 1)
 EXTRA_FLAGS += -DWEBP_HAVE_AVX2
 src/dsp/%_avx2.o: EXTRA_FLAGS += -mavx2
 endif
 # NEON-specific flags:
 # EXTRA_FLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -mtune=cortex-a8
 # -> seems to make the overall lib slower: -fno-split-wide-types
 #### Nothing should normally be changed below this line ####
 AR = ar
@ -86,7 +99,6 @@ DEC_OBJS = \
    src/dec/frame.o \
    src/dec/idec.o \
    src/dec/io.o \
    src/dec/layer.o \
    src/dec/quant.o \
    src/dec/tree.o \
    src/dec/vp8.o \
@ -97,18 +109,29 @@ DEMUX_OBJS = \
    src/demux/demux.o \
 DSP_DEC_OBJS = \
    src/dsp/alpha_processing.o \
    src/dsp/alpha_processing_sse2.o \
    src/dsp/cpu.o \
    src/dsp/dec.o \
    src/dsp/dec_clip_tables.o \
    src/dsp/dec_mips32.o \
    src/dsp/dec_neon.o \
    src/dsp/dec_sse2.o \
    src/dsp/lossless.o \
    src/dsp/lossless_mips32.o \
    src/dsp/lossless_neon.o \
    src/dsp/lossless_sse2.o \
    src/dsp/upsampling.o \
    src/dsp/upsampling_neon.o \
    src/dsp/upsampling_sse2.o \
    src/dsp/yuv.o \
    src/dsp/yuv_mips32.o \
    src/dsp/yuv_sse2.o \
 DSP_ENC_OBJS = \
    src/dsp/enc.o \
    src/dsp/enc_avx2.o \
    src/dsp/enc_mips32.o \
    src/dsp/enc_neon.o \
    src/dsp/enc_sse2.o \
@ -122,8 +145,11 @@ ENC_OBJS = \
    src/enc/frame.o \
    src/enc/histogram.o \
    src/enc/iterator.o \
    src/enc/layer.o \
    src/enc/picture.o \
    src/enc/picture_csp.o \
    src/enc/picture_psnr.o \
    src/enc/picture_rescale.o \
    src/enc/picture_tools.o \
    src/enc/quant.o \
    src/enc/syntax.o \
    src/enc/token.o \
@ -136,10 +162,14 @@ EX_FORMAT_DEC_OBJS = \
    examples/metadata.o \
    examples/pngdec.o \
    examples/tiffdec.o \
    examples/webpdec.o \
 EX_UTIL_OBJS = \
    examples/example_util.o \
 GIF2WEBP_UTIL_OBJS = \
    examples/gif2webp_util.o \
 MUX_OBJS = \
    src/mux/muxedit.o \
    src/mux/muxinternal.o \
@ -151,6 +181,7 @@ UTILS_DEC_OBJS = \
    src/utils/filters.o \
    src/utils/huffman.o \
    src/utils/quant_levels_dec.o \
    src/utils/random.o \
    src/utils/rescaler.o \
    src/utils/thread.o \
    src/utils/utils.o \
@ -175,15 +206,22 @@ HDRS_INSTALLED = \
    src/webp/types.h \
 HDRS = \
    src/dec/alphai.h \
    src/dec/decode_vp8.h \
    src/dec/vp8i.h \
    src/dec/vp8li.h \
    src/dec/webpi.h \
    src/dsp/dsp.h \
    src/dsp/lossless.h \
    src/dsp/neon.h \
    src/dsp/yuv.h \
    src/dsp/yuv_tables_sse2.h \
    src/enc/backward_references.h \
    src/enc/cost.h \
    src/enc/histogram.h \
    src/enc/vp8enci.h \
    src/enc/vp8li.h \
    src/mux/muxi.h \
    src/utils/bit_reader.h \
    src/utils/bit_writer.h \
    src/utils/color_cache.h \
@ -192,8 +230,10 @@ HDRS = \
    src/utils/huffman_encode.h \
    src/utils/quant_levels.h \
    src/utils/quant_levels_dec.h \
    src/utils/random.h \
    src/utils/rescaler.h \
    src/utils/thread.h \
    src/utils/utils.h \
    src/webp/format_constants.h \
    $(HDRS_INSTALLED) \
@ -205,6 +245,7 @@ OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
 ifeq ($(MAKECMDGOALS),clean)
  OUTPUT += $(EXTRA_EXAMPLES)
  OUTPUT += src/demux/libwebpdemux.a src/mux/libwebpmux.a
  OUTPUT += examples/libgif2webp_util.a
 endif
 ex: $(OUT_EXAMPLES)
@ -212,10 +253,19 @@ all: ex $(EXTRA_EXAMPLES)
 $(EX_FORMAT_DEC_OBJS): %.o: %.h
 # special dependencies:
 #   tree.c/vp8.c/bit_reader.c <-> bit_reader_inl.h, endian_inl.h
 #   bit_writer.c <-> endian_inl.h
 src/dec/tree.o: src/utils/bit_reader_inl.h src/utils/endian_inl.h
 src/dec/vp8.o: src/utils/bit_reader_inl.h src/utils/endian_inl.h
 src/utils/bit_reader.o: src/utils/bit_reader_inl.h src/utils/endian_inl.h
 src/utils/bit_writer.o: src/utils/endian_inl.h
 %.o: %.c $(HDRS)
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@
 examples/libexample_util.a: $(EX_UTIL_OBJS)
 examples/libgif2webp_util.a: $(GIF2WEBP_UTIL_OBJS)
 src/libwebpdecoder.a: $(LIBWEBPDECODER_OBJS)
 src/libwebp.a: $(LIBWEBP_OBJS)
 src/mux/libwebpmux.a: $(LIBWEBPMUX_OBJS)
@ -230,15 +280,18 @@ examples/gif2webp: examples/gif2webp.o
 examples/vwebp: examples/vwebp.o
 examples/webpmux: examples/webpmux.o
-examples/cwebp: src/libwebp.a
+examples/cwebp: examples/libexample_util.a src/libwebp.a
 examples/cwebp: EXTRA_LIBS += $(CWEBP_LIBS)
 examples/dwebp: examples/libexample_util.a src/libwebpdecoder.a
 examples/dwebp: EXTRA_LIBS += $(DWEBP_LIBS)
-examples/gif2webp: examples/libexample_util.a src/mux/libwebpmux.a src/libwebp.a
+examples/gif2webp: examples/libexample_util.a examples/libgif2webp_util.a
 examples/gif2webp: src/mux/libwebpmux.a src/libwebp.a
 examples/gif2webp: EXTRA_LIBS += $(GIF_LIBS)
 examples/gif2webp: EXTRA_FLAGS += -DWEBP_HAVE_GIF
 examples/vwebp: examples/libexample_util.a src/demux/libwebpdemux.a
 examples/vwebp: src/libwebp.a
 examples/vwebp: EXTRA_LIBS += $(GL_LIBS)
 examples/vwebp: EXTRA_FLAGS += -DWEBP_HAVE_GL
 examples/webpmux: examples/libexample_util.a src/mux/libwebpmux.a
 examples/webpmux: src/libwebpdecoder.a
@ -249,14 +302,14 @@ dist: DESTDIR := dist
 dist: OUT_EXAMPLES += $(EXTRA_EXAMPLES)
 dist: all
 	$(INSTALL) -m755 -d $(DESTDIR)/include/webp \
-	           $(DESTDIR)/doc $(DESTDIR)/lib
+	           $(DESTDIR)/bin $(DESTDIR)/doc $(DESTDIR)/lib
-	$(INSTALL) -m755 -s $(OUT_EXAMPLES) $(DESTDIR)
+	$(INSTALL) -m755 -s $(OUT_EXAMPLES) $(DESTDIR)/bin
 	$(INSTALL) -m644 $(HDRS_INSTALLED) $(DESTDIR)/include/webp
 	$(INSTALL) -m644 src/libwebp.a $(DESTDIR)/lib
 	$(INSTALL) -m644 src/demux/libwebpdemux.a $(DESTDIR)/lib
 	$(INSTALL) -m644 src/mux/libwebpmux.a $(DESTDIR)/lib
 	umask 022; \
-	for m in man/[cd]webp.1 man/gif2webp.1 man/webpmux.1; do \
+	for m in man/[cdv]webp.1 man/gif2webp.1 man/webpmux.1; do \
 	  basenam=$$(basename $$m .1); \
 	  $(GROFF) -t -e -man -T utf8 $$m \
 	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.txt; \
--- a/man/Makefile.am
+++ b/man/Makefile.am
@ -5,4 +5,7 @@ endif
 if BUILD_GIF2WEBP
  man_MANS += gif2webp.1
 endif
 if BUILD_VWEBP
  man_MANS += vwebp.1
 endif
 EXTRA_DIST = $(man_MANS)
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "March 13, 2013"
+.TH CWEBP 1 "Oct 13, 2014"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@ -12,13 +12,19 @@ This manual page documents the
 command.
 .PP
 \fBcwebp\fP compresses an image using the WebP format.
-Input format can be either PNG, JPEG, TIFF or raw Y'CbCr samples.
+Input format can be either PNG, JPEG, TIFF, WebP or raw Y'CbCr samples.
 .SH OPTIONS
 The basic options are:
 .TP
 .BI \-o " string
 Specify the name of the output WebP file. If omitted, \fBcwebp\fP will
 perform compression but only report statistics.
 Using "\-" as output name will direct output to 'stdout'.
 .TP
 .BI \-\- " string
 Explicitly specify the input file. This option is useful if the input
 file starts with an '\-' for instance. This option must appear \fBlast\fP.
 Any other options afterward will be ignored.
 .TP
 .B \-h, \-help
 A short usage summary.
@ -39,6 +45,15 @@ with lower quality. Best quality is achieved by using a value of 100.
 In case of lossless compression (specified by the \-lossless option), a small
 factor enables faster compression speed, but produces a larger file. Maximum
 compression is achieved by using a value of 100.
 .\" TODO(jzern): restore post-v0.4.1
 .\" .TP
 .\" .BI \-z " int
 .\" Switch on \fBlossless\fP compression mode with the specified level between 0
 .\" and 9, with level 0 being the fastest, 9 being the slowest. Fast mode
 .\" produces larger file size than slower ones. A good default is \-z 6.
 .\" This option is actually a shortcut for some predefined settings for quality
 .\" and method. If options \-q  or \-m are subsequently used, they will invalidate
 .\" the effect of this \-z option.
 .TP
 .BI \-alpha_q " int
 Specify the compression factor for alpha compression between 0 and 100.
@ -73,7 +88,7 @@ trade off between encoding speed and the compressed file size and quality.
 Possible values range from 0 to 6. Default value is 4.
 When higher values are used, the encoder will spend more time inspecting
 additional encoding possibilities and decide on the quality gain.
-Lower value can result is faster processing time at the expense of
+Lower value can result in faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
 .B \-jpeg_like
@ -153,6 +168,11 @@ close as possible to this target.
 Set a maximum number of passes to use during the dichotomy used by
 options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
 .TP
 .BI \-resize " width height
 Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
 If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
 the value will be calculated preserving the aspect-ratio.
 .TP
 .BI \-crop " x_position y_position width height
 Crop the source to a rectangle with top-left corner at coordinates
 (\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
@ -168,8 +188,9 @@ Output additional ASCII-map of encoding information. Possible map values
 range from 1 to 6. This is only meant to help debugging.
 .TP
 .BI \-pre " int
-Specify a pre-processing filter. This option is a placeholder
+Specify some pre-processing steps. Using a value of '2' will trigger
-and has currently no effect.
+quality-dependent pseudo-random dithering during RGBA->YUVA conversion
 (lossy compression only).
 .TP
 .BI \-alpha_filter " string
 Specify the predictive filtering method for the alpha plane. One of 'none',
@ -187,6 +208,11 @@ no compression, 1 uses WebP lossless format for compression. The default is 1.
 Modify unseen RGB values under fully transparent area, to help compressibility.
 The default is off.
 .TP
 .BI \-blend_alpha " int
 This option blends the alpha channel (if present) with the source using the
 background color specified in hexadecimal as 0xrrggbb. The alpha channel is
 afterward reset to the opaque value 255.
 .TP
 .B \-noalpha
 Using this option will discard the alpha channel.
 .TP
@ -244,6 +270,8 @@ cwebp \-q 50 -lossless picture.png \-o picture_lossless.webp
 cwebp \-q 70 picture_with_alpha.png \-o picture_with_alpha.webp
 .br
 cwebp \-sns 70 \-f 50 \-size 60000 picture.png \-o picture.webp
 .br
 cwebp \-o picture.webp \-\- \-\-\-picture.png
 .SH AUTHORS
 \fBcwebp\fP was written by the WebP team.
@ -255,7 +283,7 @@ for the Debian project (and may be used by others).
 .SH SEE ALSO
 .BR dwebp (1),
-.BR gif2webp (1).
+.BR gif2webp (1)
 .br
 Please refer to http://developers.google.com/speed/webp/ for additional
 information.
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "February 01, 2013"
+.TH DWEBP 1 "July 22, 2014"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@ -23,6 +23,19 @@ Print the version number (as major.minor.revision) and exit.
 .TP
 .BI \-o " string
 Specify the name of the output file (as PNG format by default).
 Using "\-" as output name will direct output to 'stdout'.
 .TP
 .BI \-\- " string
 Explicitly specify the input file. This option is useful if the input
 file starts with an '\-' for instance. This option must appear \fBlast\fP.
 Any other options afterward will be ignored. If the input file is "\-",
 the data will be read from \fIstdin\fP instead of a file.
 .TP
 .B \-bmp
 Change the output format to uncompressed BMP.
 .TP
 .B \-tiff
 Change the output format to uncompressed TIFF.
 .TP
 .B \-pam
 Change the output format to PAM (retains alpha).
@ -32,7 +45,7 @@ Change the output format to PPM (discards alpha).
 .TP
 .B \-pgm
 Change the output format to PGM. The output consists of luma/chroma
-samples instead of RGB, using the ICM4 layout. This option is mainly
+samples instead of RGB, using the IMC4 layout. This option is mainly
 for verification and debugging purposes.
 .TP
 .B \-yuv
@ -48,7 +61,21 @@ edges (especially the red ones), but should be faster.
 .B \-nofilter
 Don't use the in-loop filtering process even if it is required by
 the bitstream. This may produce visible blocks on the non-compliant output,
-but will make the decoding faster.
+but it will make the decoding faster.
 .TP
 .BI \-dither " strength
 Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
 post-processing effect applied to chroma components in lossy compression.
 It helps by smoothing gradients and avoiding banding artifacts.
 .\" TODO(jzern): restore post-v0.4.1
 .\" .TP
 .\" .BI \-alpha_dither
 .\" If the compressed file contains a transparency plane that was quantized
 .\" during compression, this flag will allow dithering the reconstructed plane
 .\" in order to generate smoother transparency gradients.
 .TP
 .B \-nodither
 Disable all dithering (default).
 .TP
 .B \-mt
 Use multi-threading for decoding, if possible.
@ -60,6 +87,10 @@ This cropping area must be fully contained within the source rectangle.
 The top-left corner will be snapped to even coordinates if needed.
 This option is meant to reduce the memory needed for cropping large images.
 Note: the cropping is applied \fIbefore\fP any scaling.
 .\" TODO(jzern): restore post-v0.4.1
 .\" .TP
 .\" .B \-flip
 .\" Flip decoded image vertically (can be useful for OpenGL textures for instance).
 .TP
 .BI \-scale " width height
 Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This
@ -84,6 +115,10 @@ http://www.webmproject.org/code/contribute/submitting-patches/
 dwebp picture.webp \-o output.png
 .br
 dwebp picture.webp \-ppm \-o output.ppm
 .br
 dwebp \-o output.ppm \-\- \-\-\-picture.webp
 .br
 cat picture.webp | dwebp \-o \- \-\- \- > output.ppm
 .SH AUTHORS
 \fBdwebp\fP was written by the WebP team.
@ -95,8 +130,8 @@ for the Debian project (and may be used by others).
 .SH SEE ALSO
 .BR cwebp (1),
-.BR webpmux (1),
+.BR gif2webp (1),
-.BR gif2webp (1).
+.BR webpmux (1)
 .br
 Please refer to http://developers.google.com/speed/webp/ for additional
 information.
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "February 01, 2013"
+.TH GIF2WEBP 1 "March 7, 2014"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@ -18,6 +18,7 @@ The basic options are:
 .BI \-o " string
 Specify the name of the output WebP file. If omitted, \fBgif2webp\fP will
 perform conversion but only report statistics.
 Using "\-" as output name will direct output to 'stdout'.
 .TP
 .B \-h, \-help
 Usage information.
@ -28,6 +29,10 @@ Print the version number (as major.minor.revision) and exit.
 .B \-lossy
 Encode the image using lossy compression.
 .TP
 .B \-mixed
 Mixed compression mode: optimize compression of the image by picking either
 lossy or lossless compression for each frame heuristically.
 .TP
 .BI \-q " float
 Specify the compression factor for RGB channels between 0 and 100. The default
 is 75.
@ -49,6 +54,43 @@ additional encoding possibilities and decide on the quality gain.
 Lower value can result in faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
 .BI \-kmin " int
 .TP
 .BI \-kmax " int
 Specify the minimum and maximum distance between consecutive key frames
 (independently decodable frames) in the output animation. The tool will insert
 some key frames into the output animation as needed so that this criterion is
 satisfied.
 .br
 A 'kmin' value of 0 will turn off insertion of key frames.
 Typical values are in the range 3 to 30. Default values are kmin = 9,
 kmax = 17 for lossless compression and kmin = 3, kmax = 5 for lossy compression.
 .br
 These two options are relevant only for animated images with a large number of
 frames (>50).
 .br
 When lower values are used, more frames will be converted to key frames. This
 may lead to a smaller number of frames required to decode a frame on average,
 thereby improving the decoding performance. But this may lead to slightly bigger
 file sizes.
 Higher values may lead to worse decoding performance, but smaller file sizes.
 .br
 Some restrictions:
 .br
 (i) kmin < kmax,
 .br
 (ii) kmin >= kmax / 2 + 1 and
 .br
 (iii) kmax - kmin <= 30.
 .br
 If any of these restrictions are not met, they will be enforced automatically.
 .TP
 .BI \-metadata " string
 A comma separated list of metadata to copy from the input to the output if
 present.
 Valid values: \fBall\fP, \fBnone\fP, \fBicc\fP, \fBxmp\fP.
 The default is \fBxmp\fP.
 .TP
 .BI \-f " int
 For lossy encoding only (specified by the \-lossy option). Specify the strength
 of the deblocking filter, between 0 (no filtering) and 100 (maximum filtering).
@ -57,6 +99,10 @@ strength of the filtering process applied after decoding the picture. The higher
 the value the smoother the picture will appear. Typical values are usually in
 the range of 20 to 50.
 .TP
 .B \-mt
 Use multi-threading for encoding, if possible. This option is only effective
 when using lossy compression.
 .TP
 .B \-v
 Print extra information.
 .TP
@ -78,6 +124,8 @@ gif2webp \-q 70 picture.gif \-o picture.webp
 gif2webp \-lossy \-m 3 picture.gif \-o picture_lossy.webp
 .br
 gif2webp \-lossy \-f 50 picture.gif \-o picture.webp
 .br
 gif2webp \-q 70 \-o picture.webp \-\- \-\-\-picture.gif
 .SH AUTHORS
 \fBgif2webp\fP was written by the WebP team.
@ -88,9 +136,9 @@ This manual page was written by Urvang Joshi <urvang@google.com>, for the
 Debian project (and may be used by others).
 .SH SEE ALSO
 .BR dwebp (1),
 .BR cwebp (1),
-.BR webpmux (1).
+.BR dwebp (1),
 .BR webpmux (1)
 .br
 Please refer to http://developers.google.com/speed/webp/ for additional
 information.
--- a/man/vwebp.1
+++ b/man/vwebp.1
@ -0,0 +1,91 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
 .TH VWEBP 1 "July 23, 2014"
 .SH NAME
 vwebp \- decompress a WebP file and display it in a window
 .SH SYNOPSIS
 .B vwebp
 .RI [ options ] " input_file.webp
 .br
 .SH DESCRIPTION
 This manual page documents the
 .B vwebp
 command.
 .PP
 \fBvwebp\fP decompresses a WebP file and displays it in a window using OpenGL.
 .SH OPTIONS
 .TP
 .B \-h
 Print usage summary.
 .TP
 .B \-version
 Print version number and exit.
 .TP
 .B \-noicc
 Don't use the ICC profile if present.
 .TP
 .B \-nofancy
 Don't use the fancy YUV420 upscaler.
 .TP
 .B \-nofilter
 Disable in-loop filtering.
 .TP
 .BI \-dither " strength
 Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
 post-processing effect applied to chroma components in lossy compression.
 It helps by smoothing gradients and avoiding banding artifacts. Default: 50.
 .\" TODO(jzern): restore post-v0.4.1
 .\" .TP
 .\" .BI \-noalphadither
 .\" By default, quantized transparency planes are dithered during decompression,
 .\" to smooth the gradients. This flag will prevent this dithering.
 .TP
 .B \-mt
 Use multi-threading for decoding, if possible.
 .TP
 .B \-info
 Display image information on top of the decoded image.
 .TP
 .BI \-\- " string
 Explicitly specify the input file. This option is useful if the input
 file starts with an '\-' for instance. This option must appear \fBlast\fP.
 Any other options afterward will be ignored. If the input file is "\-",
 the data will be read from \fIstdin\fP instead of a file.
 .TP
 .SH KEYBOARD SHORTCUTS
 .TP
 .B 'c'
 Toggle use of color profile.
 .TP
 .B 'i'
 Overlay file information.
 .TP
 .B 'q' / 'Q' / ESC
 Quit.
 .SH BUGS
 Please report all bugs to our issue tracker:
 http://code.google.com/p/webp/issues
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
 .SH EXAMPLES
 vwebp picture.webp
 .br
 vwebp picture.webp \-mt \-dither 0
 .br
 vwebp \-\- \-\-\-picture.webp
 .SH AUTHORS
 \fBvwebp\fP was written by the WebP team.
 .br
 The latest source tree is available at http://www.webmproject.org/code
 .PP
 This manual page was written for the Debian project (and may be used by others).
 .SH SEE ALSO
 .BR dwebp (1)
 .br
 Please refer to http://developers.google.com/speed/webp/ for additional
 information.
--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -1,7 +1,8 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPMUX 1 "March 16, 2013"
+.TH WEBPMUX 1 "August 28, 2014"
 .SH NAME
-webpmux \- command line tool to create WebP Mux/container file.
+webpmux \- create animated WebP files from non\-animated WebP images, extract
 frames from animated WebP images, and manage XMP/EXIF metadata and ICC profile.
 .SH SYNOPSIS
 .B webpmux \-get
 .I GET_OPTIONS
@ -45,8 +46,8 @@ This manual page documents the
 .B webpmux
 command.
 .PP
-\fBwebpmux\fP can be used to create a WebP container file
+\fBwebpmux\fP can be used to create/extract from animated WebP files, as well as
-and extract/strip relevant data from the container file.
+to add/extract/strip XMP/EXIF metadata and ICC profile.
 .SH OPTIONS
 .SS GET_OPTIONS (\-get):
 .TP
@ -60,7 +61,7 @@ Get EXIF metadata.
 Get XMP metadata.
 .TP
 .BI frame " n
-Get nth frame.
+Get nth frame from an animated image. (n = 0 has a special meaning: last frame).
 .SS SET_OPTIONS (\-set)
 .TP
@ -91,13 +92,16 @@ Strip EXIF metadata.
 Strip XMP metadata.
 .SS FRAME_OPTIONS (\-frame)
 Create an animated WebP file from multiple (non\-animated) WebP images.
 .TP
-.I file_i +di[+xi+yi[+mi]]
+.I file_i +di[+xi+yi[+mi[bi]]]
 Where: 'file_i' is the i'th frame (WebP format), 'xi','yi' specify the image
-offset for this frame, 'di' is the pause duration before next frame and 'mi' is
+offset for this frame, 'di' is the pause duration before next frame, 'mi' is
-the dispose method for this frame (0 for NONE or 1 for BACKGROUND).
+the dispose method for this frame (0 for NONE or 1 for BACKGROUND) and 'bi' is
-'mi' can be omitted and will default to 0 (NONE).
+the blending method for this frame (+b for BLEND or \-b for NO_BLEND).
-Additionally, if 'mi' is ommitted then'xi' and 'yi' can be omitted and will
+Argument 'bi' can be omitted and will default to +b (BLEND).
 Also, 'mi' can be omitted if 'bi' is omitted and will default to 0 (NONE).
 Finally, if 'mi' and 'bi' are omitted then 'xi' and 'yi' can be omitted and will
 default to +0+0.
 .TP
 .BI \-loop " n
@ -128,34 +132,67 @@ Please report all bugs to our issue tracker:
 http://code.google.com/p/webp/issues
 .br
 Patches welcome! See this page to get started:
-http://www.webmproject.org/code/contribute/submitting-patches/
+http://www.webmproject.org/code/contribute/submitting\-patches/
 .SH EXAMPLES
 .P
 Add ICC profile:
 .br
 webpmux \-set icc image_profile.icc in.webp \-o icc_container.webp
 .P
 Extract ICC profile:
 .br
 webpmux \-get icc icc_container.webp \-o image_profile.icc
 .P
 Strip ICC profile:
 .br
 webpmux \-strip icc icc_container.webp \-o without_icc.webp
 .P
 Add XMP metadata:
 .br
 webpmux \-set xmp image_metadata.xmp in.webp \-o xmp_container.webp
 .P
 Extract XMP metadata:
 .br
 webpmux \-get xmp xmp_container.webp \-o image_metadata.xmp
 .P
 Strip XMP metadata:
 .br
 webpmux \-strip xmp xmp_container.webp \-o without_xmp.webp
 .P
 Add EXIF metadata:
 .br
 webpmux \-set exif image_metadata.exif in.webp \-o exif_container.webp
 .P
 Extract EXIF metadata:
 .br
 webpmux \-get exif exif_container.webp \-o image_metadata.exif
 .P
 Strip EXIF metadata:
 .br
 webpmux \-strip exif exif_container.webp \-o without_exif.webp
 .P
 Create an animated WebP file from 3 (non\-animated) WebP images:
 .br
-webpmux \-frame anim_1.webp +100 \-frame anim_2.webp +100+50+50 \-loop 10
+webpmux \-frame 1.webp +100 \-frame 2.webp +100+50+50
 .br
 .RS 8
-\-bgcolor 255,255,255,255 \-o anim_container.webp
+\-frame 3.webp +100+50+50+1+b \-loop 10 \-bgcolor 255,255,255,255
 .br
 \-o anim_container.webp
 .RE
 .P
 Get the 2nd frame from an animated WebP file:
 .br
 webpmux \-get frame 2 anim_container.webp \-o frame_2.webp
 .P
 Using \-get/\-set/\-strip with input file name starting with '\-':
 .br
 webpmux \-set icc image_profile.icc \-o icc_container.webp \-\- \-\-\-in.webp
 .br
 webpmux \-get icc \-o image_profile.icc \-\- \-\-\-icc_container.webp
 .br
 webpmux \-strip icc \-o without_icc.webp \-\- \-\-\-icc_container.webp
 .SH AUTHORS
 \fBwebpmux\fP is written by the WebP team.
@ -166,9 +203,9 @@ This manual page was written by Vikas Arora <vikaas.arora@gmail.com>,
 for the Debian project (and may be used by others).
 .SH SEE ALSO
 .BR dwebp (1),
 .BR cwebp (1),
-.BR gif2webp (1).
+.BR dwebp (1),
 .BR gif2webp (1)
 .br
 Please refer to http://developers.google.com/speed/webp/ for additional
 information.
--- a/src/.gitignore
+++ b/src/.gitignore
@ -1 +0,0 @@
 /*.pc
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -8,7 +8,6 @@ if WANT_DEMUX
  SUBDIRS += demux
 endif
 AM_CPPFLAGS = -I$(top_srcdir)/src
 lib_LTLIBRARIES = libwebp.la
 if BUILD_LIBWEBPDECODER
@ -36,7 +35,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 4:2:0
+libwebp_la_LDFLAGS = -no-undefined -version-info 5:3:0
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc
@ -48,7 +47,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la
-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 0:0:0
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:3:0
  pkgconfig_DATA += libwebpdecoder.pc
 endif
--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@ -1,14 +1,13 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src
 noinst_LTLIBRARIES = libwebpdecode.la
 libwebpdecode_la_SOURCES =
 libwebpdecode_la_SOURCES += alpha.c
 libwebpdecode_la_SOURCES += alphai.h
 libwebpdecode_la_SOURCES += buffer.c
 libwebpdecode_la_SOURCES += decode_vp8.h
 libwebpdecode_la_SOURCES += frame.c
 libwebpdecode_la_SOURCES += idec.c
 libwebpdecode_la_SOURCES += io.c
 libwebpdecode_la_SOURCES += layer.c
 libwebpdecode_la_SOURCES += quant.c
 libwebpdecode_la_SOURCES += tree.c
 libwebpdecode_la_SOURCES += vp8.c
--- a/src/dec/alpha.c
+++ b/src/dec/alpha.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane decompression.
@ -10,120 +12,154 @@
 // Author: Skal (pascal.massimino@gmail.com)
 #include <stdlib.h>
 #include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "../utils/filters.h"
 #include "../utils/quant_levels_dec.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+//------------------------------------------------------------------------------
-extern "C" {
+// ALPHDecoder object.
 #endif
-// TODO(skal): move to dsp/ ?
+ALPHDecoder* ALPHNew(void) {
-static void CopyPlane(const uint8_t* src, int src_stride,
+  ALPHDecoder* const dec = (ALPHDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
-                      uint8_t* dst, int dst_stride, int width, int height) {
+  return dec;
-  while (height-- > 0) {
+}
-    memcpy(dst, src, width);
+
-    src += src_stride;
+void ALPHDelete(ALPHDecoder* const dec) {
-    dst += dst_stride;
+  if (dec != NULL) {
    VP8LDelete(dec->vp8l_dec_);
    dec->vp8l_dec_ = NULL;
    WebPSafeFree(dec);
  }
 }
 //------------------------------------------------------------------------------
-// Decodes the compressed data 'data' of size 'data_size' into the 'output'.
+// Decoding.
 // The 'output' buffer should be pre-allocated and must be of the same
 // dimension 'height'x'stride', as that of the image.
 //
 // Returns 1 on successfully decoding the compressed alpha and
 //         0 if either:
 //           error in bit-stream header (invalid compression mode or filter), or
 //           error returned by appropriate compression method.
-static int DecodeAlpha(const uint8_t* data, size_t data_size,
+// Initialize alpha decoding by parsing the alpha header and decoding the image
-                       int width, int height, int stride, uint8_t* output) {
+// header for alpha data stored using lossless compression.
-  uint8_t* decoded_data = NULL;
+// Returns false in case of error in alpha header (data too short, invalid
-  const size_t decoded_size = height * width;
+// compression method or filter, error in lossless header data etc).
-  WEBP_FILTER_TYPE filter;
+static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
-  int pre_processing;
+                    size_t data_size, int width, int height, uint8_t* output) {
  int rsrv;
  int ok = 0;
-  int method;
+  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
  int rsrv;
-  assert(width > 0 && height > 0 && stride >= width);
+  assert(width > 0 && height > 0);
  assert(data != NULL && output != NULL);
  dec->width_ = width;
  dec->height_ = height;
  if (data_size <= ALPHA_HEADER_LEN) {
    return 0;
  }
-  method = (data[0] >> 0) & 0x03;
+  dec->method_ = (data[0] >> 0) & 0x03;
-  filter = (data[0] >> 2) & 0x03;
+  dec->filter_ = (data[0] >> 2) & 0x03;
-  pre_processing = (data[0] >> 4) & 0x03;
+  dec->pre_processing_ = (data[0] >> 4) & 0x03;
  rsrv = (data[0] >> 6) & 0x03;
-  if (method < ALPHA_NO_COMPRESSION ||
+  if (dec->method_ < ALPHA_NO_COMPRESSION ||
-      method > ALPHA_LOSSLESS_COMPRESSION ||
+      dec->method_ > ALPHA_LOSSLESS_COMPRESSION ||
-      filter >= WEBP_FILTER_LAST ||
+      dec->filter_ >= WEBP_FILTER_LAST ||
-      pre_processing > ALPHA_PREPROCESSED_LEVELS ||
+      dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS ||
      rsrv != 0) {
    return 0;
  }
-  if (method == ALPHA_NO_COMPRESSION) {
+  if (dec->method_ == ALPHA_NO_COMPRESSION) {
-    ok = (data_size >= decoded_size);
+    const size_t alpha_decoded_size = dec->width_ * dec->height_;
-    decoded_data = (uint8_t*)data + ALPHA_HEADER_LEN;
+    ok = (alpha_data_size >= alpha_decoded_size);
  } else {
-    decoded_data = (uint8_t*)malloc(decoded_size);
+    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
-    if (decoded_data == NULL) return 0;
+    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
    ok = VP8LDecodeAlphaImageStream(width, height,
                                    data + ALPHA_HEADER_LEN,
                                    data_size - ALPHA_HEADER_LEN,
                                    decoded_data);
  }
  if (ok) {
    WebPUnfilterFunc unfilter_func = WebPUnfilters[filter];
    if (unfilter_func != NULL) {
      // TODO(vikas): Implement on-the-fly decoding & filter mechanism to decode
      // and apply filter per image-row.
      unfilter_func(width, height, width, decoded_data);
    }
    // Construct raw_data (height x stride) from alpha data (height x width).
    CopyPlane(decoded_data, width, output, stride, width, height);
    if (pre_processing == ALPHA_PREPROCESSED_LEVELS) {
      ok = DequantizeLevels(decoded_data, width, height);
    }
  }
  if (method != ALPHA_NO_COMPRESSION) {
    free(decoded_data);
  }
  return ok;
 }
 // Decodes, unfilters and dequantizes *at least* 'num_rows' rows of alpha
 // starting from row number 'row'. It assumes that rows up to (row - 1) have
 // already been decoded.
 // Returns false in case of bitstream error.
 static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
  ALPHDecoder* const alph_dec = dec->alph_dec_;
  const int width = alph_dec->width_;
  const int height = alph_dec->height_;
  WebPUnfilterFunc unfilter_func = WebPUnfilters[alph_dec->filter_];
  uint8_t* const output = dec->alpha_plane_;
  if (alph_dec->method_ == ALPHA_NO_COMPRESSION) {
    const size_t offset = row * width;
    const size_t num_pixels = num_rows * width;
    assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN + offset + num_pixels);
    memcpy(dec->alpha_plane_ + offset,
           dec->alpha_data_ + ALPHA_HEADER_LEN + offset, num_pixels);
  } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
    assert(alph_dec->vp8l_dec_ != NULL);
    if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) {
      return 0;
    }
  }
  if (unfilter_func != NULL) {
    unfilter_func(width, height, width, row, num_rows, output);
  }
  if (row + num_rows == dec->pic_hdr_.height_) {
    dec->is_alpha_decoded_ = 1;
  }
  return 1;
 }
 //------------------------------------------------------------------------------
 // Main entry point.
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                      int row, int num_rows) {
-  const int stride = dec->pic_hdr_.width_;
+  const int width = dec->pic_hdr_.width_;
  const int height = dec->pic_hdr_.height_;
-  if (row < 0 || num_rows < 0 || row + num_rows > dec->pic_hdr_.height_) {
+  if (row < 0 || num_rows <= 0 || row + num_rows > height) {
    return NULL;    // sanity check.
  }
  if (row == 0) {
-    // Decode everything during the first call.
+    // Initialize decoding.
-    if (!DecodeAlpha(dec->alpha_data_, (size_t)dec->alpha_data_size_,
+    assert(dec->alpha_plane_ != NULL);
-                     dec->pic_hdr_.width_, dec->pic_hdr_.height_, stride,
+    dec->alph_dec_ = ALPHNew();
-                     dec->alpha_plane_)) {
+    if (dec->alph_dec_ == NULL) return NULL;
-      return NULL;  // Error.
+    if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
                  width, height, dec->alpha_plane_)) {
      ALPHDelete(dec->alph_dec_);
      dec->alph_dec_ = NULL;
      return NULL;
    }
    // if we allowed use of alpha dithering, check whether it's needed at all
    if (dec->alph_dec_->pre_processing_ != ALPHA_PREPROCESSED_LEVELS) {
      dec->alpha_dithering_ = 0;  // disable dithering
    } else {
      num_rows = height;          // decode everything in one pass
    }
  }
  if (!dec->is_alpha_decoded_) {
    int ok = 0;
    assert(dec->alph_dec_ != NULL);
    ok = ALPHDecode(dec, row, num_rows);
    if (ok && dec->alpha_dithering_ > 0) {
      ok = WebPDequantizeLevels(dec->alpha_plane_, width, height,
                                dec->alpha_dithering_);
    }
    if (!ok || dec->is_alpha_decoded_) {
      ALPHDelete(dec->alph_dec_);
      dec->alph_dec_ = NULL;
    }
    if (!ok) return NULL;  // Error.
  }
  // Return a pointer to the current decoded row.
-  return dec->alpha_plane_ + row * stride;
+  return dec->alpha_plane_ + row * width;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/alphai.h
+++ b/src/dec/alphai.h
@ -0,0 +1,55 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha decoder: internal header.
 //
 // Author: Urvang (urvang@google.com)
 #ifndef WEBP_DEC_ALPHAI_H_
 #define WEBP_DEC_ALPHAI_H_
 #include "./webpi.h"
 #include "../utils/filters.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 struct VP8LDecoder;  // Defined in dec/vp8li.h.
 typedef struct ALPHDecoder ALPHDecoder;
 struct ALPHDecoder {
  int width_;
  int height_;
  int method_;
  WEBP_FILTER_TYPE filter_;
  int pre_processing_;
  struct VP8LDecoder* vp8l_dec_;
  VP8Io io_;
  int use_8b_decode;  // Although alpha channel requires only 1 byte per
                      // pixel, sometimes VP8LDecoder may need to allocate
                      // 4 bytes per pixel internally during decode.
 };
 //------------------------------------------------------------------------------
 // internal functions. Not public.
 // Allocates a new alpha decoder instance.
 ALPHDecoder* ALPHNew(void);
 // Clears and deallocates an alpha decoder instance.
 void ALPHDelete(ALPHDecoder* const dec);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_DEC_ALPHAI_H_ */
--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Everything about WebPDecBuffer
@ -15,10 +17,6 @@
 #include "./webpi.h"
 #include "../utils/utils.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // WebPDecBuffer
@ -44,29 +42,34 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
    ok = 0;
  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
-    const uint64_t y_size = (uint64_t)buf->y_stride * height;
+    const int y_stride = abs(buf->y_stride);
-    const uint64_t u_size = (uint64_t)buf->u_stride * ((height + 1) / 2);
+    const int u_stride = abs(buf->u_stride);
-    const uint64_t v_size = (uint64_t)buf->v_stride * ((height + 1) / 2);
+    const int v_stride = abs(buf->v_stride);
-    const uint64_t a_size = (uint64_t)buf->a_stride * height;
+    const int a_stride = abs(buf->a_stride);
    const uint64_t y_size = (uint64_t)y_stride * height;
    const uint64_t u_size = (uint64_t)u_stride * ((height + 1) / 2);
    const uint64_t v_size = (uint64_t)v_stride * ((height + 1) / 2);
    const uint64_t a_size = (uint64_t)a_stride * height;
    ok &= (y_size <= buf->y_size);
    ok &= (u_size <= buf->u_size);
    ok &= (v_size <= buf->v_size);
-    ok &= (buf->y_stride >= width);
+    ok &= (y_stride >= width);
-    ok &= (buf->u_stride >= (width + 1) / 2);
+    ok &= (u_stride >= (width + 1) / 2);
-    ok &= (buf->v_stride >= (width + 1) / 2);
+    ok &= (v_stride >= (width + 1) / 2);
    ok &= (buf->y != NULL);
    ok &= (buf->u != NULL);
    ok &= (buf->v != NULL);
    if (mode == MODE_YUVA) {
-      ok &= (buf->a_stride >= width);
+      ok &= (a_stride >= width);
      ok &= (a_size <= buf->a_size);
      ok &= (buf->a != NULL);
    }
  } else {    // RGB checks
    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
-    const uint64_t size = (uint64_t)buf->stride * height;
+    const int stride = abs(buf->stride);
    const uint64_t size = (uint64_t)stride * height;
    ok &= (size <= buf->size);
-    ok &= (buf->stride >= width * kModeBpp[mode]);
+    ok &= (stride >= width * kModeBpp[mode]);
    ok &= (buf->rgba != NULL);
  }
  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
@ -133,9 +136,35 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
  return CheckDecBuffer(buffer);
 }
 VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
  if (buffer == NULL) {
    return VP8_STATUS_INVALID_PARAM;
  }
  if (WebPIsRGBMode(buffer->colorspace)) {
    WebPRGBABuffer* const buf = &buffer->u.RGBA;
    buf->rgba += (buffer->height - 1) * buf->stride;
    buf->stride = -buf->stride;
  } else {
    WebPYUVABuffer* const buf = &buffer->u.YUVA;
    const int H = buffer->height;
    buf->y += (H - 1) * buf->y_stride;
    buf->y_stride = -buf->y_stride;
    buf->u += ((H - 1) >> 1) * buf->u_stride;
    buf->u_stride = -buf->u_stride;
    buf->v += ((H - 1) >> 1) * buf->v_stride;
    buf->v_stride = -buf->v_stride;
    if (buf->a != NULL) {
      buf->a += (H - 1) * buf->a_stride;
      buf->a_stride = -buf->a_stride;
    }
  }
  return VP8_STATUS_OK;
 }
 VP8StatusCode WebPAllocateDecBuffer(int w, int h,
                                    const WebPDecoderOptions* const options,
                                    WebPDecBuffer* const out) {
  VP8StatusCode status;
  if (out == NULL || w <= 0 || h <= 0) {
    return VP8_STATUS_INVALID_PARAM;
  }
@ -162,8 +191,17 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
  out->width = w;
  out->height = h;
-  // Then, allocate buffer for real
+  // Then, allocate buffer for real.
-  return AllocateBuffer(out);
+  status = AllocateBuffer(out);
  if (status != VP8_STATUS_OK) return status;
 #if WEBP_DECODER_ABI_VERSION > 0x0203
  // Use the stride trick if vertical flip is needed.
  if (options != NULL && options->flip) {
    status = WebPFlipBuffer(out);
  }
 #endif
  return status;
 }
 //------------------------------------------------------------------------------
@ -180,8 +218,9 @@ int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {
 void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
  if (buffer != NULL) {
-    if (!buffer->is_external_memory)
+    if (!buffer->is_external_memory) {
-      free(buffer->private_memory);
+      WebPSafeFree(buffer->private_memory);
    }
    buffer->private_memory = NULL;
  }
 }
@ -210,6 +249,3 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
 //------------------------------------------------------------------------------
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/decode_vp8.h
+++ b/src/dec/decode_vp8.h
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Low-level API for VP8 decoder
@ -14,7 +16,7 @@
 #include "../webp/decode.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
@ -130,7 +132,8 @@ static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
 }
-// Start decoding a new picture. Returns true if ok.
+// Decode the VP8 frame header. Returns true if ok.
 // Note: 'io->data' must be pointing to the start of the VP8 frame header.
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
@ -175,7 +178,7 @@ WEBP_EXTERN(int) VP8LGetInfo(
    const uint8_t* data, size_t data_size,  // data available so far
    int* const width, int* const height, int* const has_alpha);
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Frame-reconstruction function. Memory allocation.
@ -13,12 +15,11 @@
 #include "./vp8i.h"
 #include "../utils/utils.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #define ALIGN_MASK (32 - 1)
 static void ReconstructRow(const VP8Decoder* const dec,
                           const VP8ThreadContext* ctx);  // TODO(skal): remove
 //------------------------------------------------------------------------------
 // Filtering
@ -29,25 +30,18 @@ extern "C" {
 //                 U/V, so it's 8 samples total (because of the 2x upsampling).
 static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
 static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
  if (keyframe) {
    return (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
  } else {
    return (level >= 40) ? 3 : (level >= 20) ? 2 : (level >= 15) ? 1 : 0;
  }
 }
 static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int cache_id = ctx->id_;
  const int y_bps = dec->cache_y_stride_;
-  VP8FInfo* const f_info = ctx->f_info_ + mb_x;
+  const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
-  uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
+  uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
  const int level = f_info->f_level_;
  const int ilevel = f_info->f_ilevel_;
-  const int limit = 2 * level + ilevel;
+  const int limit = f_info->f_limit_;
-  if (level == 0) {
+  if (limit == 0) {
    return;
  }
  assert(limit >= 3);
  if (dec->filter_type_ == 1) {   // simple
    if (mb_x > 0) {
      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
@ -63,10 +57,9 @@ static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
    }
  } else {    // complex
    const int uv_bps = dec->cache_uv_stride_;
-    uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
-    uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
-    const int hev_thresh =
+    const int hev_thresh = f_info->hev_thresh_;
        hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
    if (mb_x > 0) {
      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
@ -126,21 +119,112 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
          }
        }
        level = (level < 0) ? 0 : (level > 63) ? 63 : level;
-        info->f_level_ = level;
+        if (level > 0) {
-
+          int ilevel = level;
          if (hdr->sharpness_ > 0) {
            if (hdr->sharpness_ > 4) {
-            level >>= 2;
+              ilevel >>= 2;
            } else {
-            level >>= 1;
+              ilevel >>= 1;
            }
-          if (level > 9 - hdr->sharpness_) {
+            if (ilevel > 9 - hdr->sharpness_) {
-            level = 9 - hdr->sharpness_;
+              ilevel = 9 - hdr->sharpness_;
            }
          }
-        info->f_ilevel_ = (level < 1) ? 1 : level;
+          if (ilevel < 1) ilevel = 1;
-        info->f_inner_ = 0;
+          info->f_ilevel_ = ilevel;
          info->f_limit_ = 2 * level + ilevel;
          info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
        } else {
          info->f_limit_ = 0;  // no filtering
        }
        info->f_inner_ = i4x4;
      }
    }
  }
 }
 //------------------------------------------------------------------------------
 // Dithering
 #define DITHER_AMP_TAB_SIZE 12
 static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
  // roughly, it's dqm->uv_mat_[1]
  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
 };
 void VP8InitDithering(const WebPDecoderOptions* const options,
                      VP8Decoder* const dec) {
  assert(dec != NULL);
  if (options != NULL) {
    const int d = options->dithering_strength;
    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
    if (f > 0) {
      int s;
      int all_amp = 0;
      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
        VP8QuantMatrix* const dqm = &dec->dqm_[s];
        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
          // TODO(skal): should we specially dither more for uv_quant_ < 0?
          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
        }
        all_amp |= dqm->dither_;
      }
      if (all_amp != 0) {
        VP8InitRandom(&dec->dithering_rg_, 1.0f);
        dec->dither_ = 1;
      }
    }
 #if WEBP_DECODER_ABI_VERSION > 0x0204
    // potentially allow alpha dithering
    dec->alpha_dithering_ = options->alpha_dithering_strength;
    if (dec->alpha_dithering_ > 100) {
      dec->alpha_dithering_ = 100;
    } else if (dec->alpha_dithering_ < 0) {
      dec->alpha_dithering_ = 0;
    }
 #endif
  }
 }
 // minimal amp that will provide a non-zero dithering effect
 #define MIN_DITHER_AMP 4
 #define DITHER_DESCALE 4
 #define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
 #define DITHER_AMP_BITS 8
 #define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
 static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
  int i, j;
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) {
      // TODO: could be made faster with SSE2
      const int bits =
          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
      const int v = (int)dst[i] + delta;
      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;
    }
    dst += bps;
  }
 }
 static void DitherRow(VP8Decoder* const dec) {
  int mb_x;
  assert(dec->dither_);
  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
    const VP8MBData* const data = ctx->mb_data_ + mb_x;
    const int cache_id = ctx->id_;
    const int uv_bps = dec->cache_uv_stride_;
    if (data->dither_ >= MIN_DITHER_AMP) {
      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
      Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
      Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
    }
  }
 }
@ -162,25 +246,35 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
 static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int cache_id = ctx->id_;
  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
  const int ysize = extra_y_rows * dec->cache_y_stride_;
  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
-  const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
+  const int y_offset = cache_id * 16 * dec->cache_y_stride_;
-  const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
+  const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
  uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
  uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
  uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
-  const int first_row = (ctx->mb_y_ == 0);
+  const int mb_y = ctx->mb_y_;
-  const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
+  const int is_first_row = (mb_y == 0);
-  int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
+  const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
-  int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
+
  if (dec->mt_method_ == 2) {
    ReconstructRow(dec, ctx);
  }
  if (ctx->filter_row_) {
    FilterRow(dec);
  }
-  if (io->put) {
+  if (dec->dither_) {
-    if (!first_row) {
+    DitherRow(dec);
  }
  if (io->put != NULL) {
    int y_start = MACROBLOCK_VPOS(mb_y);
    int y_end = MACROBLOCK_VPOS(mb_y + 1);
    if (!is_first_row) {
      y_start -= extra_y_rows;
      io->y = ydst;
      io->u = udst;
@ -191,7 +285,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
      io->v = dec->cache_v_ + uv_offset;
    }
-    if (!last_row) {
+    if (!is_last_row) {
      y_end -= extra_y_rows;
    }
    if (y_end > io->crop_bottom) {
@ -199,11 +293,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
    }
    io->a = NULL;
    if (dec->alpha_data_ != NULL && y_start < y_end) {
-      // TODO(skal): several things to correct here:
+      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
-      // * testing presence of alpha with dec->alpha_data_ is not a good idea
+      // good idea.
      // * we're actually decompressing the full plane only once. It should be
      //   more obvious from signature.
      // * we could free alpha_data_ right after this call, but we don't own.
      io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
      if (io->a == NULL) {
        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@ -235,8 +326,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
    }
  }
  // rotate top samples if needed
-  if (ctx->id_ + 1 == dec->num_caches_) {
+  if (cache_id + 1 == dec->num_caches_) {
-    if (!last_row) {
+    if (!is_last_row) {
      memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
      memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
      memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
@ -253,27 +344,40 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  VP8ThreadContext* const ctx = &dec->thread_ctx_;
-  if (!dec->use_threads_) {
+  const int filter_row =
      (dec->filter_type_ > 0) &&
      (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
  if (dec->mt_method_ == 0) {
    // ctx->id_ and ctx->f_info_ are already set
    ctx->mb_y_ = dec->mb_y_;
-    ctx->filter_row_ = dec->filter_row_;
+    ctx->filter_row_ = filter_row;
    ReconstructRow(dec, ctx);
    ok = FinishRow(dec, io);
  } else {
    WebPWorker* const worker = &dec->worker_;
    // Finish previous job *before* updating context
-    ok &= WebPWorkerSync(worker);
+    ok &= WebPGetWorkerInterface()->Sync(worker);
    assert(worker->status_ == OK);
    if (ok) {   // spawn a new deblocking/output job
      ctx->io_ = *io;
      ctx->id_ = dec->cache_id_;
      ctx->mb_y_ = dec->mb_y_;
-      ctx->filter_row_ = dec->filter_row_;
+      ctx->filter_row_ = filter_row;
-      if (ctx->filter_row_) {    // just swap filter info
+      if (dec->mt_method_ == 2) {  // swap macroblock data
        VP8MBData* const tmp = ctx->mb_data_;
        ctx->mb_data_ = dec->mb_data_;
        dec->mb_data_ = tmp;
      } else {
        // perform reconstruction directly in main thread
        ReconstructRow(dec, ctx);
      }
      if (filter_row) {            // swap filter info
        VP8FInfo* const tmp = ctx->f_info_;
        ctx->f_info_ = dec->f_info_;
        dec->f_info_ = tmp;
      }
-      WebPWorkerLaunch(worker);
+      // (reconstruct)+filter in parallel
      WebPGetWorkerInterface()->Launch(worker);
      if (++dec->cache_id_ == dec->num_caches_) {
        dec->cache_id_ = 0;
      }
@ -287,8 +391,8 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  // Call setup() first. This may trigger additional decoding features on 'io'.
-  // Note: Afterward, we must call teardown() not matter what.
+  // Note: Afterward, we must call teardown() no matter what.
-  if (io->setup && !io->setup(io)) {
+  if (io->setup != NULL && !io->setup(io)) {
    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
    return dec->status_;
  }
@ -301,7 +405,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  // Define the area where we can skip in-loop filtering, in case of cropping.
  //
-  // 'Simple' filter reads two luma samples outside of the macroblock and
+  // 'Simple' filter reads two luma samples outside of the macroblock
  // and filters one. It doesn't filter the chroma samples. Hence, we can
  // avoid doing the in-loop filtering before crop_top/crop_left position.
  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
@ -342,11 +446,11 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
-    ok = WebPWorkerSync(&dec->worker_);
+    ok = WebPGetWorkerInterface()->Sync(&dec->worker_);
  }
-  if (io->teardown) {
+  if (io->teardown != NULL) {
    io->teardown(io);
  }
  return ok;
@ -382,9 +486,9 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
 // Initialize multi/single-thread worker
 static int InitThreadContext(VP8Decoder* const dec) {
  dec->cache_id_ = 0;
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
    WebPWorker* const worker = &dec->worker_;
-    if (!WebPWorkerReset(worker)) {
+    if (!WebPGetWorkerInterface()->Reset(worker)) {
      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                         "thread initialization failed.");
    }
@ -399,6 +503,28 @@ static int InitThreadContext(VP8Decoder* const dec) {
  return 1;
 }
 int VP8GetThreadMethod(const WebPDecoderOptions* const options,
                       const WebPHeaderStructure* const headers,
                       int width, int height) {
  if (options == NULL || options->use_threads == 0) {
    return 0;
  }
  (void)headers;
  (void)width;
  (void)height;
  assert(headers == NULL || !headers->is_lossless);
 #if defined(WEBP_USE_THREAD)
  if (width < MIN_WIDTH_FOR_THREADS) return 0;
  // TODO(skal): tune the heuristic further
 #if 0
  if (height < 2 * width) return 2;
 #endif
  return 2;
 #else   // !WEBP_USE_THREAD
  return 0;
 #endif
 }
 #undef MT_CACHE_LINES
 #undef ST_CACHE_LINES
@ -410,14 +536,15 @@ static int AllocateMemory(VP8Decoder* const dec) {
  const int mb_w = dec->mb_w_;
  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
-  const size_t top_size = (16 + 8 + 8) * mb_w;
+  const size_t top_size = sizeof(VP8TopSamples) * mb_w;
  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
  const size_t f_info_size =
      (dec->filter_type_ > 0) ?
-          mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
+          mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo)
        : 0;
  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
-  const size_t coeffs_size = 384 * sizeof(*dec->coeffs_);
+  const size_t mb_data_size =
      (dec->mt_method_ == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
  const size_t cache_height = (16 * num_caches
                            + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
  const size_t cache_size = top_size * cache_height;
@ -426,13 +553,13 @@ static int AllocateMemory(VP8Decoder* const dec) {
      (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
  const uint64_t needed = (uint64_t)intra_pred_mode_size
                        + top_size + mb_info_size + f_info_size
-                        + yuv_size + coeffs_size
+                        + yuv_size + mb_data_size
                        + cache_size + alpha_size + ALIGN_MASK;
  uint8_t* mem;
  if (needed != (size_t)needed) return 0;  // check for overflow
  if (needed > dec->mem_size_) {
-    free(dec->mem_);
+    WebPSafeFree(dec->mem_);
    dec->mem_size_ = 0;
    dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t));
    if (dec->mem_ == NULL) {
@ -447,12 +574,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
  dec->intra_t_ = (uint8_t*)mem;
  mem += intra_pred_mode_size;
-  dec->y_t_ = (uint8_t*)mem;
+  dec->yuv_t_ = (VP8TopSamples*)mem;
-  mem += 16 * mb_w;
+  mem += top_size;
  dec->u_t_ = (uint8_t*)mem;
  mem += 8 * mb_w;
  dec->v_t_ = (uint8_t*)mem;
  mem += 8 * mb_w;
  dec->mb_info_ = ((VP8MB*)mem) + 1;
  mem += mb_info_size;
@ -461,7 +584,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
  mem += f_info_size;
  dec->thread_ctx_.id_ = 0;
  dec->thread_ctx_.f_info_ = dec->f_info_;
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
    // secondary cache line. The deblocking process need to make use of the
    // filtering strength from previous macroblock row, while the new ones
    // are being decoded in parallel. We'll just swap the pointers.
@ -473,8 +596,12 @@ static int AllocateMemory(VP8Decoder* const dec) {
  dec->yuv_b_ = (uint8_t*)mem;
  mem += yuv_size;
-  dec->coeffs_ = (int16_t*)mem;
+  dec->mb_data_ = (VP8MBData*)mem;
-  mem += coeffs_size;
+  dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
  if (dec->mt_method_ == 2) {
    dec->thread_ctx_.mb_data_ += mb_w;
  }
  mem += mb_data_size;
  dec->cache_y_stride_ = 16 * mb_w;
  dec->cache_uv_stride_ = 8 * mb_w;
@ -496,8 +623,9 @@ static int AllocateMemory(VP8Decoder* const dec) {
  mem += alpha_size;
  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
-  // note: left-info is initialized once for all.
+  // note: left/top-info is initialized once for all.
  memset(dec->mb_info_ - 1, 0, mb_info_size);
  VP8InitScanline(dec);   // initialize left too.
  // initialize top
  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
@ -534,30 +662,64 @@ static const int kScan[16] = {
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
 };
-static WEBP_INLINE int CheckMode(VP8Decoder* const dec, int mode) {
+static int CheckMode(int mb_x, int mb_y, int mode) {
  if (mode == B_DC_PRED) {
-    if (dec->mb_x_ == 0) {
+    if (mb_x == 0) {
-      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
    } else {
-      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
    }
  }
  return mode;
 }
-static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) {
+static void Copy32b(uint8_t* dst, uint8_t* src) {
-  *(uint32_t*)dst = *(uint32_t*)src;
+  memcpy(dst, src, 4);
 }
-void VP8ReconstructBlock(VP8Decoder* const dec) {
+static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
                                    uint8_t* const dst) {
  switch (bits >> 30) {
    case 3:
      VP8Transform(src, dst, 0);
      break;
    case 2:
      VP8TransformAC3(src, dst);
      break;
    case 1:
      VP8TransformDC(src, dst);
      break;
    default:
      break;
  }
 }
 static void DoUVTransform(uint32_t bits, const int16_t* const src,
                          uint8_t* const dst) {
  if (bits & 0xff) {    // any non-zero coeff at all?
    if (bits & 0xaa) {  // any non-zero AC coefficient?
      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
    } else {
      VP8TransformDCUV(src, dst);
    }
  }
 }
 static void ReconstructRow(const VP8Decoder* const dec,
                           const VP8ThreadContext* ctx) {
  int j;
  int mb_x;
  const int mb_y = ctx->mb_y_;
  const int cache_id = ctx->id_;
  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
    const VP8MBData* const block = ctx->mb_data_ + mb_x;
    // Rotate in the left samples from previously decoded block. We move four
    // pixels at a time for alignment reason, and because of in-loop filter.
-  if (dec->mb_x_ > 0) {
+    if (mb_x > 0) {
      for (j = -1; j < 16; ++j) {
        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
      }
@ -574,23 +736,22 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
        v_dst[j * BPS - 1] = 129;
      }
      // Init top-left sample on left column too
-    if (dec->mb_y_ > 0) {
+      if (mb_y > 0) {
        y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
      }
    }
    {
      // bring top samples into the cache
-    uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16;
+      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
-    uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8;
+      const int16_t* const coeffs = block->coeffs_;
-    uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8;
+      uint32_t bits = block->non_zero_y_;
    const int16_t* coeffs = dec->coeffs_;
      int n;
-    if (dec->mb_y_ > 0) {
+      if (mb_y > 0) {
-      memcpy(y_dst - BPS, top_y, 16);
+        memcpy(y_dst - BPS, top_yuv[0].y, 16);
-      memcpy(u_dst - BPS, top_u, 8);
+        memcpy(u_dst - BPS, top_yuv[0].u, 8);
-      memcpy(v_dst - BPS, top_v, 8);
+        memcpy(v_dst - BPS, top_yuv[0].v, 8);
-    } else if (dec->mb_x_ == 0) {
+      } else if (mb_x == 0) {
        // we only need to do this init once at block (0,0).
        // Afterward, it remains valid for the whole topmost row.
        memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
@ -599,82 +760,59 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
      }
      // predict and add residuals
-
+      if (block->is_i4x4_) {   // 4x4
    if (dec->is_i4x4_) {   // 4x4
        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
-      if (dec->mb_y_ > 0) {
+        if (mb_y > 0) {
-        if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
+          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
-          top_right[0] = top_y[15] * 0x01010101u;
+            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
          } else {
-          memcpy(top_right, top_y + 16, sizeof(*top_right));
+            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
          }
        }
        // replicate the top-right pixels below
        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
-      // predict and add residues for all 4x4 blocks in turn.
+        // predict and add residuals for all 4x4 blocks in turn.
-      for (n = 0; n < 16; n++) {
+        for (n = 0; n < 16; ++n, bits <<= 2) {
          uint8_t* const dst = y_dst + kScan[n];
-        VP8PredLuma4[dec->imodes_[n]](dst);
+          VP8PredLuma4[block->imodes_[n]](dst);
-        if (dec->non_zero_ac_ & (1 << n)) {
+          DoTransform(bits, coeffs + n * 16, dst);
          VP8Transform(coeffs + n * 16, dst, 0);
        } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
          VP8TransformDC(coeffs + n * 16, dst);
        }
        }
      } else {    // 16x16
-      const int pred_func = CheckMode(dec, dec->imodes_[0]);
+        const int pred_func = CheckMode(mb_x, mb_y,
                                        block->imodes_[0]);
        VP8PredLuma16[pred_func](y_dst);
-      if (dec->non_zero_) {
+        if (bits != 0) {
-        for (n = 0; n < 16; n++) {
+          for (n = 0; n < 16; ++n, bits <<= 2) {
-          uint8_t* const dst = y_dst + kScan[n];
+            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
          if (dec->non_zero_ac_ & (1 << n)) {
            VP8Transform(coeffs + n * 16, dst, 0);
          } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
            VP8TransformDC(coeffs + n * 16, dst);
          }
          }
        }
      }
      {
        // Chroma
-      const int pred_func = CheckMode(dec, dec->uvmode_);
+        const uint32_t bits_uv = block->non_zero_uv_;
        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
        VP8PredChroma8[pred_func](u_dst);
        VP8PredChroma8[pred_func](v_dst);
-
+        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
-      if (dec->non_zero_ & 0x0f0000) {   // chroma-U
+        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
        const int16_t* const u_coeffs = dec->coeffs_ + 16 * 16;
        if (dec->non_zero_ac_ & 0x0f0000) {
          VP8TransformUV(u_coeffs, u_dst);
        } else {
          VP8TransformDCUV(u_coeffs, u_dst);
        }
      }
      if (dec->non_zero_ & 0xf00000) {   // chroma-V
        const int16_t* const v_coeffs = dec->coeffs_ + 20 * 16;
        if (dec->non_zero_ac_ & 0xf00000) {
          VP8TransformUV(v_coeffs, v_dst);
        } else {
          VP8TransformDCUV(v_coeffs, v_dst);
        }
      }
      // stash away top samples for next block
-      if (dec->mb_y_ < dec->mb_h_ - 1) {
+      if (mb_y < dec->mb_h_ - 1) {
-        memcpy(top_y, y_dst + 15 * BPS, 16);
+        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
-        memcpy(top_u, u_dst +  7 * BPS,  8);
+        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
-        memcpy(top_v, v_dst +  7 * BPS,  8);
+        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
      }
      }
    }
    // Transfer reconstructed samples from yuv_b_ cache to final destination.
    {
-    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
+      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
-    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
+      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
-    uint8_t* const y_out = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
+      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
-    uint8_t* const u_out = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
+      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
-    uint8_t* const v_out = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
+      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
      for (j = 0; j < 16; ++j) {
        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
      }
@ -684,9 +822,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
      }
    }
  }
 }
 //------------------------------------------------------------------------------
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Incremental decoding
@ -13,14 +15,11 @@
 #include <string.h>
 #include <stdlib.h>
 #include "./alphai.h"
 #include "./webpi.h"
 #include "./vp8i.h"
 #include "../utils/utils.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
 #define CHUNK_SIZE 4096
@ -29,11 +28,13 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Data structures for memory and states
-// Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE.
+// Decoding states. State normally flows as:
 // WEBP_HEADER->VP8_HEADER->VP8_PARTS0->VP8_DATA->DONE for a lossy image, and
 // WEBP_HEADER->VP8L_HEADER->VP8L_DATA->DONE for a lossless image.
 // If there is any error the decoder goes into state ERROR.
 typedef enum {
-  STATE_PRE_VP8,  // All data before that of the first VP8 chunk.
+  STATE_WEBP_HEADER,  // All the data before that of the VP8/VP8L chunk.
-  STATE_VP8_FRAME_HEADER,  // For VP8 Frame header (within VP8 chunk).
+  STATE_VP8_HEADER,   // The VP8 Frame header (within the VP8 chunk).
  STATE_VP8_PARTS0,
  STATE_VP8_DATA,
  STATE_VP8L_HEADER,
@ -71,32 +72,41 @@ struct WebPIDecoder {
  MemBuffer mem_;          // input memory buffer.
  WebPDecBuffer output_;   // output buffer (when no external one is supplied)
  size_t chunk_size_;      // Compressed VP8/VP8L size extracted from Header.
  int last_mb_y_;          // last row reached for intra-mode decoding
 };
 // MB context to restore in case VP8DecodeMB() fails
 typedef struct {
  VP8MB left_;
  VP8MB info_;
  uint8_t intra_t_[4];
  uint8_t intra_l_[4];
  VP8BitReader br_;
  VP8BitReader token_br_;
 } MBContext;
 //------------------------------------------------------------------------------
 // MemBuffer: incoming data handling
 static void RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) {
  if (br->buf_ != NULL) {
    br->buf_ += offset;
    br->buf_end_ += offset;
  }
 }
 static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
  return (mem->end_ - mem->start_);
 }
 // Check if we need to preserve the compressed alpha data, as it may not have
 // been decoded yet.
 static int NeedCompressedAlpha(const WebPIDecoder* const idec) {
  if (idec->state_ == STATE_WEBP_HEADER) {
    // We haven't parsed the headers yet, so we don't know whether the image is
    // lossy or lossless. This also means that we haven't parsed the ALPH chunk.
    return 0;
  }
  if (idec->is_lossless_) {
    return 0;  // ALPH chunk is not present for lossless images.
  } else {
    const VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
    assert(dec != NULL);  // Must be true as idec->state_ != STATE_WEBP_HEADER.
    return (dec->alpha_data_ != NULL) && !dec->is_alpha_decoded_;
  }
 }
 static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
  MemBuffer* const mem = &idec->mem_;
  const uint8_t* const new_base = mem->buf_ + mem->start_;
@ -112,16 +122,32 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
      if (offset != 0) {
        int p;
        for (p = 0; p <= last_part; ++p) {
-          RemapBitReader(dec->parts_ + p, offset);
+          VP8RemapBitReader(dec->parts_ + p, offset);
        }
        // Remap partition #0 data pointer to new offset, but only in MAP
        // mode (in APPEND mode, partition #0 is copied into a fixed memory).
        if (mem->mode_ == MEM_MODE_MAP) {
-          RemapBitReader(&dec->br_, offset);
+          VP8RemapBitReader(&dec->br_, offset);
        }
      }
      assert(last_part >= 0);
      dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
      if (NeedCompressedAlpha(idec)) {
        ALPHDecoder* const alph_dec = dec->alph_dec_;
        dec->alpha_data_ += offset;
        if (alph_dec != NULL) {
          if (alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION) {
            VP8LDecoder* const alph_vp8l_dec = alph_dec->vp8l_dec_;
            assert(alph_vp8l_dec != NULL);
            assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN);
            VP8LBitReaderSetBuffer(&alph_vp8l_dec->br_,
                                   dec->alpha_data_ + ALPHA_HEADER_LEN,
                                   dec->alpha_data_size_ - ALPHA_HEADER_LEN);
          } else {  // alph_dec->method_ == ALPHA_NO_COMPRESSION
            // Nothing special to do in this case.
          }
        }
      }
    } else {    // Resize lossless bitreader
      VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
      VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem));
@ -133,8 +159,12 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
 // size if required and also updates VP8BitReader's if new memory is allocated.
 static int AppendToMemBuffer(WebPIDecoder* const idec,
                             const uint8_t* const data, size_t data_size) {
  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  MemBuffer* const mem = &idec->mem_;
-  const uint8_t* const old_base = mem->buf_ + mem->start_;
+  const int need_compressed_alpha = NeedCompressedAlpha(idec);
  const uint8_t* const old_start = mem->buf_ + mem->start_;
  const uint8_t* const old_base =
      need_compressed_alpha ? dec->alpha_data_ : old_start;
  assert(mem->mode_ == MEM_MODE_APPEND);
  if (data_size > MAX_CHUNK_PAYLOAD) {
    // security safeguard: trying to allocate more than what the format
@ -143,17 +173,18 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
  }
  if (mem->end_ + data_size > mem->buf_size_) {  // Need some free memory
-    const size_t current_size = MemDataSize(mem);
+    const size_t new_mem_start = old_start - old_base;
    const size_t current_size = MemDataSize(mem) + new_mem_start;
    const uint64_t new_size = (uint64_t)current_size + data_size;
    const uint64_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
    uint8_t* const new_buf =
        (uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
    if (new_buf == NULL) return 0;
    memcpy(new_buf, old_base, current_size);
-    free(mem->buf_);
+    WebPSafeFree(mem->buf_);
    mem->buf_ = new_buf;
    mem->buf_size_ = (size_t)extra_size;
-    mem->start_ = 0;
+    mem->start_ = new_mem_start;
    mem->end_ = current_size;
  }
@ -161,14 +192,15 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
  mem->end_ += data_size;
  assert(mem->end_ <= mem->buf_size_);
-  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
+  DoRemap(idec, mem->buf_ + mem->start_ - old_start);
  return 1;
 }
 static int RemapMemBuffer(WebPIDecoder* const idec,
                          const uint8_t* const data, size_t data_size) {
  MemBuffer* const mem = &idec->mem_;
-  const uint8_t* const old_base = mem->buf_ + mem->start_;
+  const uint8_t* const old_buf = mem->buf_;
  const uint8_t* const old_start = old_buf + mem->start_;
  assert(mem->mode_ == MEM_MODE_MAP);
  if (data_size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!
@ -176,7 +208,7 @@ static int RemapMemBuffer(WebPIDecoder* const idec,
  mem->buf_ = (uint8_t*)data;
  mem->end_ = mem->buf_size_ = data_size;
-  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
+  DoRemap(idec, mem->buf_ + mem->start_ - old_start);
  return 1;
 }
@ -191,8 +223,8 @@ static void InitMemBuffer(MemBuffer* const mem) {
 static void ClearMemBuffer(MemBuffer* const mem) {
  assert(mem);
  if (mem->mode_ == MEM_MODE_APPEND) {
-    free(mem->buf_);
+    WebPSafeFree(mem->buf_);
-    free((void*)mem->part0_buf_);
+    WebPSafeFree((void*)mem->part0_buf_);
  }
 }
@ -206,35 +238,36 @@ static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {
  return 1;
 }
 // To be called last.
 static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
 #if WEBP_DECODER_ABI_VERSION > 0x0203
  const WebPDecoderOptions* const options = idec->params_.options;
  WebPDecBuffer* const output = idec->params_.output;
  idec->state_ = STATE_DONE;
  if (options != NULL && options->flip) {
    return WebPFlipBuffer(output);
  }
 #endif
  idec->state_ = STATE_DONE;
  return VP8_STATUS_OK;
 }
 //------------------------------------------------------------------------------
 // Macroblock-decoding contexts
 static void SaveContext(const VP8Decoder* dec, const VP8BitReader* token_br,
                        MBContext* const context) {
-  const VP8BitReader* const br = &dec->br_;
+  context->left_ = dec->mb_info_[-1];
-  const VP8MB* const left = dec->mb_info_ - 1;
+  context->info_ = dec->mb_info_[dec->mb_x_];
  const VP8MB* const info = dec->mb_info_ + dec->mb_x_;
  context->left_ = *left;
  context->info_ = *info;
  context->br_ = *br;
  context->token_br_ = *token_br;
  memcpy(context->intra_t_, dec->intra_t_ + 4 * dec->mb_x_, 4);
  memcpy(context->intra_l_, dec->intra_l_, 4);
 }
 static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
                           VP8BitReader* const token_br) {
-  VP8BitReader* const br = &dec->br_;
+  dec->mb_info_[-1] = context->left_;
-  VP8MB* const left = dec->mb_info_ - 1;
+  dec->mb_info_[dec->mb_x_] = context->info_;
  VP8MB* const info = dec->mb_info_ + dec->mb_x_;
  *left = context->left_;
  *info = context->info_;
  *br = context->br_;
  *token_br = context->token_br_;
  memcpy(dec->intra_t_ + 4 * dec->mb_x_, context->intra_t_, 4);
  memcpy(dec->intra_l_, context->intra_l_, 4);
 }
 //------------------------------------------------------------------------------
@ -242,7 +275,7 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
 static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
  if (idec->state_ == STATE_VP8_DATA) {
    VP8Io* const io = &idec->io_;
-    if (io->teardown) {
+    if (io->teardown != NULL) {
      io->teardown(io);
    }
  }
@ -270,6 +303,7 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
  headers.data = data;
  headers.data_size = curr_size;
  headers.have_all_data = 0;
  status = WebPParseHeaders(&headers);
  if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
    return VP8_STATUS_SUSPENDED;  // We haven't found a VP8 chunk yet.
@ -285,15 +319,9 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
    idec->dec_ = dec;
 #ifdef WEBP_USE_THREAD
    dec->use_threads_ = (idec->params_.options != NULL) &&
                        (idec->params_.options->use_threads > 0);
 #else
    dec->use_threads_ = 0;
 #endif
    dec->alpha_data_ = headers.alpha_data;
    dec->alpha_data_size_ = headers.alpha_data_size;
-    ChangeState(idec, STATE_VP8_FRAME_HEADER, headers.offset);
+    ChangeState(idec, STATE_VP8_HEADER, headers.offset);
  } else {
    VP8LDecoder* const dec = VP8LNew();
    if (dec == NULL) {
@ -308,13 +336,14 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
 static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
  const size_t curr_size = MemDataSize(&idec->mem_);
  int width, height;
  uint32_t bits;
  if (curr_size < VP8_FRAME_HEADER_SIZE) {
    // Not enough data bytes to extract VP8 Frame Header.
    return VP8_STATUS_SUSPENDED;
  }
-  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, NULL, NULL)) {
+  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, &width, &height)) {
    return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
  }
@ -328,30 +357,33 @@ static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
 }
 // Partition #0
-static int CopyParts0Data(WebPIDecoder* const idec) {
+static VP8StatusCode CopyParts0Data(WebPIDecoder* const idec) {
  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  VP8BitReader* const br = &dec->br_;
-  const size_t psize = br->buf_end_ - br->buf_;
+  const size_t part_size = br->buf_end_ - br->buf_;
  MemBuffer* const mem = &idec->mem_;
  assert(!idec->is_lossless_);
  assert(mem->part0_buf_ == NULL);
-  assert(psize > 0);
+  // the following is a format limitation, no need for runtime check:
-  assert(psize <= mem->part0_size_);  // Format limit: no need for runtime check
+  assert(part_size <= mem->part0_size_);
  if (part_size == 0) {   // can't have zero-size partition #0
    return VP8_STATUS_BITSTREAM_ERROR;
  }
  if (mem->mode_ == MEM_MODE_APPEND) {
    // We copy and grab ownership of the partition #0 data.
-    uint8_t* const part0_buf = (uint8_t*)malloc(psize);
+    uint8_t* const part0_buf = (uint8_t*)WebPSafeMalloc(1ULL, part_size);
    if (part0_buf == NULL) {
-      return 0;
+      return VP8_STATUS_OUT_OF_MEMORY;
    }
-    memcpy(part0_buf, br->buf_, psize);
+    memcpy(part0_buf, br->buf_, part_size);
    mem->part0_buf_ = part0_buf;
    br->buf_ = part0_buf;
-    br->buf_end_ = part0_buf + psize;
+    br->buf_end_ = part0_buf + part_size;
  } else {
    // Else: just keep pointers to the partition #0's data in dec_->br_.
  }
-  mem->start_ += psize;
+  mem->start_ += part_size;
-  return 1;
+  return VP8_STATUS_OK;
 }
 static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
@ -381,9 +413,14 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
  if (dec->status_ != VP8_STATUS_OK) {
    return IDecError(idec, dec->status_);
  }
  // This change must be done before calling VP8InitFrame()
  dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
                                       io->width, io->height);
  VP8InitDithering(params->options, dec);
-  if (!CopyParts0Data(idec)) {
+  dec->status_ = CopyParts0Data(idec);
-    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
+  if (dec->status_ != VP8_STATUS_OK) {
    return IDecError(idec, dec->status_);
  }
  // Finish setting up the decoding parameters. Will call io->setup().
@ -407,49 +444,52 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  VP8Io* const io = &idec->io_;
  assert(dec->ready_);
  for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
-    VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
+    if (idec->last_mb_y_ != dec->mb_y_) {
-    if (dec->mb_x_ == 0) {
+      if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
-      VP8InitScanline(dec);
+        // note: normally, error shouldn't occur since we already have the whole
        // partition0 available here in DecodeRemaining(). Reaching EOF while
        // reading intra modes really means a BITSTREAM_ERROR.
        return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
      }
-    for (; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+      idec->last_mb_y_ = dec->mb_y_;
    }
    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
      VP8BitReader* const token_br =
          &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
      MBContext context;
      SaveContext(dec, token_br, &context);
      if (!VP8DecodeMB(dec, token_br)) {
        RestoreContext(&context, dec, token_br);
        // We shouldn't fail when MAX_MB data was available
        if (dec->num_parts_ == 1 && MemDataSize(&idec->mem_) > MAX_MB_SIZE) {
          return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
        }
        RestoreContext(&context, dec, token_br);
        return VP8_STATUS_SUSPENDED;
      }
      // Reconstruct and emit samples.
      VP8ReconstructBlock(dec);
      // Release buffer only if there is only one partition
      if (dec->num_parts_ == 1) {
        idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_;
        assert(idec->mem_.start_ <= idec->mem_.end_);
      }
    }
    VP8InitScanline(dec);   // Prepare for next scanline
    // Reconstruct, filter and emit the row.
    if (!VP8ProcessRow(dec, io)) {
      return IDecError(idec, VP8_STATUS_USER_ABORT);
    }
    dec->mb_x_ = 0;
  }
  // Synchronize the thread and check for errors.
  if (!VP8ExitCritical(dec, io)) {
    return IDecError(idec, VP8_STATUS_USER_ABORT);
  }
  dec->ready_ = 0;
-  idec->state_ = STATE_DONE;
+  return FinishDecoding(idec);
  return VP8_STATUS_OK;
 }
-static int ErrorStatusLossless(WebPIDecoder* const idec, VP8StatusCode status) {
+static VP8StatusCode ErrorStatusLossless(WebPIDecoder* const idec,
                                         VP8StatusCode status) {
  if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) {
    return VP8_STATUS_SUSPENDED;
  }
@ -494,26 +534,30 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
  }
  if (!VP8LDecodeImage(dec)) {
    // The decoding is called after all the data-bytes are aggregated. Change
    // the error to VP8_BITSTREAM_ERROR in case lossless decoder fails to decode
    // all the pixels (VP8_STATUS_SUSPENDED).
    if (dec->status_ == VP8_STATUS_SUSPENDED) {
      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
    }
    return ErrorStatusLossless(idec, dec->status_);
  }
-  idec->state_ = STATE_DONE;
+  return FinishDecoding(idec);
  return VP8_STATUS_OK;
 }
  // Main decoding loop
 static VP8StatusCode IDecode(WebPIDecoder* idec) {
  VP8StatusCode status = VP8_STATUS_SUSPENDED;
-  if (idec->state_ == STATE_PRE_VP8) {
+  if (idec->state_ == STATE_WEBP_HEADER) {
    status = DecodeWebPHeaders(idec);
  } else {
    if (idec->dec_ == NULL) {
      return VP8_STATUS_SUSPENDED;    // can't continue if we have no decoder.
    }
  }
-  if (idec->state_ == STATE_VP8_FRAME_HEADER) {
+  if (idec->state_ == STATE_VP8_HEADER) {
    status = DecodeVP8FrameHeader(idec);
  }
  if (idec->state_ == STATE_VP8_PARTS0) {
@ -535,20 +579,23 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
 // Public functions
 WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
-  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(*idec));
+  WebPIDecoder* idec = (WebPIDecoder*)WebPSafeCalloc(1ULL, sizeof(*idec));
  if (idec == NULL) {
    return NULL;
  }
-  idec->state_ = STATE_PRE_VP8;
+  idec->state_ = STATE_WEBP_HEADER;
  idec->chunk_size_ = 0;
  idec->last_mb_y_ = -1;
  InitMemBuffer(&idec->mem_);
  WebPInitDecBuffer(&idec->output_);
  VP8InitIo(&idec->io_);
  WebPResetDecParams(&idec->params_);
-  idec->params_.output = output_buffer ? output_buffer : &idec->output_;
+  idec->params_.output = (output_buffer != NULL) ? output_buffer
                                                 : &idec->output_;
  WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.
  return idec;
@ -580,14 +627,18 @@ void WebPIDelete(WebPIDecoder* idec) {
  if (idec == NULL) return;
  if (idec->dec_ != NULL) {
    if (!idec->is_lossless_) {
-      VP8Delete(idec->dec_);
+      if (idec->state_ == STATE_VP8_DATA) {
        // Synchronize the thread, clean-up and check for errors.
        VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
      }
      VP8Delete((VP8Decoder*)idec->dec_);
    } else {
-      VP8LDelete(idec->dec_);
+      VP8LDelete((VP8LDecoder*)idec->dec_);
    }
  }
  ClearMemBuffer(&idec->mem_);
  WebPFreeDecBuffer(&idec->output_);
-  free(idec);
+  WebPSafeFree(idec);
 }
 //------------------------------------------------------------------------------
@ -797,7 +848,7 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
                    VP8IoSetupHook setup,
                    VP8IoTeardownHook teardown,
                    void* user_data) {
-  if (idec == NULL || idec->state_ > STATE_PRE_VP8) {
+  if (idec == NULL || idec->state_ > STATE_WEBP_HEADER) {
    return 0;
  }
@ -809,6 +860,3 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
  return 1;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/io.c
+++ b/src/dec/io.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // functions for sample output.
@ -15,10 +17,7 @@
 #include "./webpi.h"
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"
-
+#include "../utils/utils.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions
@ -46,27 +45,13 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
 // Point-sampling U/V sampler.
 static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
-  WebPDecBuffer* output = p->output;
+  WebPDecBuffer* const output = p->output;
-  const WebPRGBABuffer* const buf = &output->u.RGBA;
+  WebPRGBABuffer* const buf = &output->u.RGBA;
-  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
-  const uint8_t* y_src = io->y;
+  WebPSamplerProcessPlane(io->y, io->y_stride,
-  const uint8_t* u_src = io->u;
+                          io->u, io->v, io->uv_stride,
-  const uint8_t* v_src = io->v;
+                          dst, buf->stride, io->mb_w, io->mb_h,
-  const WebPSampleLinePairFunc sample = WebPSamplers[output->colorspace];
+                          WebPSamplers[output->colorspace]);
  const int mb_w = io->mb_w;
  const int last = io->mb_h - 1;
  int j;
  for (j = 0; j < last; j += 2) {
    sample(y_src, y_src + io->y_stride, u_src, v_src,
           dst, dst + buf->stride, mb_w);
    y_src += 2 * io->y_stride;
    u_src += io->uv_stride;
    v_src += io->uv_stride;
    dst += 2 * buf->stride;
  }
  if (j == last) {  // Just do the last line twice
    sample(y_src, y_src, u_src, v_src, dst, dst, mb_w);
  }
  return io->mb_h;
 }
@ -117,7 +102,7 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
  if (y == 0) {
    // First line is special cased. We mirror the u/v samples at boundary.
-    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
+    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w);
  } else {
    // We can finish the left-over line from previous call.
    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
@ -252,7 +237,11 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
    int num_rows;
    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
 #ifdef WEBP_SWAP_16BIT_CSP
    uint8_t* alpha_dst = base_rgba;
 #else
    uint8_t* alpha_dst = base_rgba + 1;
 #endif
    uint32_t alpha_mask = 0x0f;
    int i, j;
@ -291,7 +280,17 @@ static int Rescale(const uint8_t* src, int src_stride,
 static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
  const int mb_h = io->mb_h;
  const int uv_mb_h = (mb_h + 1) >> 1;
-  const int num_lines_out = Rescale(io->y, io->y_stride, mb_h, &p->scaler_y);
+  WebPRescaler* const scaler = &p->scaler_y;
  int num_lines_out = 0;
  if (WebPIsAlphaMode(p->output->colorspace) && io->a != NULL) {
    // Before rescaling, we premultiply the luma directly into the io->y
    // internal buffer. This is OK since these samples are not used for
    // intra-prediction (the top samples are saved in cache_y_/u_/v_).
    // But we need to cast the const away, though.
    WebPMultRows((uint8_t*)io->y, io->y_stride,
                 io->a, io->width, io->mb_w, mb_h, 0);
  }
  num_lines_out = Rescale(io->y, io->y_stride, mb_h, scaler);
  Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u);
  Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v);
  return num_lines_out;
@ -299,7 +298,14 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
  if (io->a != NULL) {
-    Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
+    const WebPYUVABuffer* const buf = &p->output->u.YUVA;
    uint8_t* dst_y = buf->y + p->last_y * buf->y_stride;
    const uint8_t* src_a = buf->a + p->last_y * buf->a_stride;
    const int num_lines_out = Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
    if (num_lines_out > 0) {   // unmultiply the Y
      WebPMultRows(dst_y, buf->y_stride, src_a, buf->a_stride,
                   p->scaler_a.dst_width, num_lines_out, 1);
    }
  }
  return 0;
 }
@ -318,11 +324,11 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
  size_t tmp_size;
  int32_t* work;
-  tmp_size = work_size + 2 * uv_work_size;
+  tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
  if (has_alpha) {
-    tmp_size += work_size;
+    tmp_size += work_size * sizeof(*work);
  }
-  p->memory = calloc(1, tmp_size * sizeof(*work));
+  p->memory = WebPSafeCalloc(1ULL, tmp_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
@ -349,6 +355,7 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
                     io->mb_w, out_width, io->mb_h, out_height,
                     work + work_size + 2 * uv_work_size);
    p->emit_alpha = EmitRescaledAlphaYUV;
    WebPInitAlphaProcessing();
  }
  return 1;
 }
@ -368,9 +375,9 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
         WebPRescalerHasPendingOutput(&p->scaler_u)) {
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
-    WebPRescalerExportRow(&p->scaler_y);
+    WebPRescalerExportRow(&p->scaler_y, 0);
-    WebPRescalerExportRow(&p->scaler_u);
+    WebPRescalerExportRow(&p->scaler_u, 0);
-    WebPRescalerExportRow(&p->scaler_v);
+    WebPRescalerExportRow(&p->scaler_v, 0);
    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
            dst, p->scaler_y.dst_width);
    dst += buf->stride;
@ -418,7 +425,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) {
  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
    int i;
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a);
+    WebPRescalerExportRow(&p->scaler_a, 0);
    for (i = 0; i < width; ++i) {
      const uint32_t alpha_value = p->scaler_a.dst[i];
      dst[4 * i] = alpha_value;
@ -437,7 +444,11 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) {
 static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
 #ifdef WEBP_SWAP_16BIT_CSP
  uint8_t* alpha_dst = base_rgba;
 #else
  uint8_t* alpha_dst = base_rgba + 1;
 #endif
  int num_lines_out = 0;
  const WEBP_CSP_MODE colorspace = p->output->colorspace;
  const int width = p->scaler_a.dst_width;
@ -447,7 +458,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
    int i;
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a);
+    WebPRescalerExportRow(&p->scaler_a, 0);
    for (i = 0; i < width; ++i) {
      // Fill in the alpha value (converted to 4 bits).
      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
@ -486,7 +497,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
  int32_t* work;  // rescalers work area
  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
-  size_t tmp_size1, tmp_size2;
+  size_t tmp_size1, tmp_size2, total_size;
  tmp_size1 = 3 * work_size;
  tmp_size2 = 3 * out_width;
@ -494,7 +505,8 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
    tmp_size1 += work_size;
    tmp_size2 += out_width;
  }
-  p->memory = calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
+  total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
  p->memory = WebPSafeCalloc(1ULL, total_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
@ -526,6 +538,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
    } else {
      p->emit_alpha_row = ExportAlpha;
    }
    WebPInitAlphaProcessing();
  }
  return 1;
 }
@ -546,7 +559,9 @@ static int CustomSetup(VP8Io* io) {
  if (!WebPIoInitFromOptions(p->options, io, is_alpha ? MODE_YUV : MODE_YUVA)) {
    return 0;
  }
-
+  if (is_alpha && WebPIsPremultipliedMode(colorspace)) {
    WebPInitUpsamplers();
  }
  if (io->use_scaling) {
    const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
    if (!ok) {
@ -555,10 +570,10 @@ static int CustomSetup(VP8Io* io) {
  } else {
    if (is_rgb) {
      p->emit = EmitSampledRGB;   // default
 #ifdef FANCY_UPSAMPLING
      if (io->fancy_upsampling) {
 #ifdef FANCY_UPSAMPLING
        const int uv_width = (io->mb_w + 1) >> 1;
-        p->memory = malloc(io->mb_w + 2 * uv_width);
+        p->memory = WebPSafeMalloc(1ULL, (size_t)(io->mb_w + 2 * uv_width));
        if (p->memory == NULL) {
          return 0;   // memory error.
        }
@ -567,18 +582,22 @@ static int CustomSetup(VP8Io* io) {
        p->tmp_v = p->tmp_u + uv_width;
        p->emit = EmitFancyRGB;
        WebPInitUpsamplers();
      }
 #endif
      } else {
        WebPInitSamplers();
      }
    } else {
      p->emit = EmitYUV;
    }
    if (is_alpha) {  // need transparency output
      if (WebPIsPremultipliedMode(colorspace)) WebPInitPremultiply();
      p->emit_alpha =
          (colorspace == MODE_RGBA_4444 || colorspace == MODE_rgbA_4444) ?
              EmitAlphaRGBA4444
          : is_rgb ? EmitAlphaRGB
          : EmitAlphaYUV;
      if (is_rgb) {
        WebPInitAlphaProcessing();
      }
    }
  }
@ -601,7 +620,7 @@ static int CustomPut(const VP8Io* io) {
    return 0;
  }
  num_lines_out = p->emit(io, p);
-  if (p->emit_alpha) {
+  if (p->emit_alpha != NULL) {
    p->emit_alpha(io, p);
  }
  p->last_y += num_lines_out;
@ -612,7 +631,7 @@ static int CustomPut(const VP8Io* io) {
 static void CustomTeardown(const VP8Io* io) {
  WebPDecParams* const p = (WebPDecParams*)io->opaque;
-  free(p->memory);
+  WebPSafeFree(p->memory);
  p->memory = NULL;
 }
@ -627,7 +646,3 @@ void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
 }
 //------------------------------------------------------------------------------
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/layer.c
+++ b/src/dec/layer.c
@ -1,35 +0,0 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Enhancement layer (for YUV444/422)
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <assert.h>
 #include <stdlib.h>
 #include "./vp8i.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // Placeholder for enhancement-layer decoding: nothing is decoded yet.
 // Asserts that 'dec' is non-NULL and that dec->layer_data_size_ is positive,
 // then unconditionally returns 1.
 int VP8DecodeLayer(VP8Decoder* const dec) {
  assert(dec);
  assert(dec->layer_data_size_ > 0);
  (void)dec;   // keeps 'dec' referenced when the asserts are compiled out
  // TODO: handle enhancement layer here.
  return 1;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Quantizer initialization
@ -11,10 +13,6 @@
 #include "./vp8i.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 // Returns 0 when 'v' is negative, 'M' when 'v' exceeds 'M', and 'v' itself
 // otherwise.
 static WEBP_INLINE int clip(int v, int M) {
  if (v < 0) return 0;
  if (v > M) return M;
  return v;
 }
@ -102,12 +100,11 @@ void VP8ParseQuant(VP8Decoder* const dec) {
      m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
      m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
      m->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
    }
  }
 }
 //------------------------------------------------------------------------------
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@ -1,22 +1,21 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Coding trees and probas
 //
 // Author: Skal (pascal.massimino@gmail.com)
-#include "vp8i.h"
+#include "./vp8i.h"
 #include "../utils/bit_reader_inl.h"
 #define USE_GENERIC_TREE
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #ifdef USE_GENERIC_TREE
 static const int8_t kYModesIntra4[18] = {
  -B_DC_PRED, 1,
@ -31,61 +30,12 @@ static const int8_t kYModesIntra4[18] = {
 };
 #endif
 #ifndef ONLY_KEYFRAME_CODE
 // inter prediction modes
 // The first four identifiers are numbered explicitly (0..3); NEARESTMV through
 // SPLITMV continue the sequence and therefore take the values 4..8.
 enum {
  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
 // Decision trees stored as flat arrays of entry pairs.
 // NOTE(review): presumably these use the same encoding as kYModesIntra4
 // (a positive entry is the index of the next pair, any other entry is the
 // negated leaf value); no reader of these four tables is visible here, so
 // confirm against the consumer.
 static const int8_t kYModesInter[8] = {
  -DC_PRED, 1,
    2, 3,
      -V_PRED, -H_PRED,
      -TM_PRED, -B_PRED
 };
 // Leaves are the plain integers 3, 2, 0 and 1 (note that '-0' is just 0).
 static const int8_t kMBSplit[6] = {
  -3, 1,
    -2, 2,
      -0, -1
 };
 // Leaves are the negated NEARESTMV..SPLITMV enum values declared above.
 static const int8_t kMVRef[8] = {
  -ZEROMV, 1,
    -NEARESTMV, 2,
      -NEARMV, 3,
        -NEWMV, -SPLITMV
 };
 // Leaves are the negated LEFT4..NEW4 enum values; since LEFT4 is 0, the
 // first entry is 0.
 static const int8_t kMVRef4[6] = {
  -LEFT4, 1,
    -ABOVE4, 2,
      -ZERO4, -NEW4
 };
 #endif
 //------------------------------------------------------------------------------
 // Default probabilities
 // Inter
 #ifndef ONLY_KEYFRAME_CODE
 // Initial contents of proba->ymode_; copied there by VP8ResetProba().
 static const uint8_t kYModeProbaInter0[4] = { 112, 86, 140, 37 };
 // Initial contents of proba->uvmode_; copied there by VP8ResetProba().
 static const uint8_t kUVModeProbaInter0[3] = { 162, 101, 204 };
 // Initial contents of proba->mv_ (two rows of NUM_MV_PROBAS entries); copied
 // there by VP8ResetProba().
 static const uint8_t kMVProba0[2][NUM_MV_PROBAS] = {
  { 162, 128, 225, 146, 172, 147, 214,  39,
    156, 128, 129, 132,  75, 145, 178, 206,
    239, 254, 254 },
  { 164, 128, 204, 170, 119, 235, 140, 230,
    228, 128, 130, 130,  74, 148, 180, 203,
    236, 254, 254 }
 };
 #endif
 // Paragraph 13.5
 static const uint8_t
  CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
  // generated using vp8_default_coef_probs() in entropy.c:129
  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@ -326,28 +276,38 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
 void VP8ResetProba(VP8Proba* const proba) {
  memset(proba->segments_, 255u, sizeof(proba->segments_));
-  memcpy(proba->coeffs_, CoeffsProba0, sizeof(CoeffsProba0));
+  // proba->bands_[][] is initialized later
 #ifndef ONLY_KEYFRAME_CODE
  memcpy(proba->mv_, kMVProba0, sizeof(kMVProba0));
  memcpy(proba->ymode_, kYModeProbaInter0, sizeof(kYModeProbaInter0));
  memcpy(proba->uvmode_, kUVModeProbaInter0, sizeof(kUVModeProbaInter0));
 #endif
 }
-void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
+static void ParseIntraMode(VP8BitReader* const br,
-  uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
+                           VP8Decoder* const dec, int mb_x) {
  uint8_t* const top = dec->intra_t_ + 4 * mb_x;
  uint8_t* const left = dec->intra_l_;
  VP8MBData* const block = dec->mb_data_ + mb_x;
  // Note: we don't save segment map (yet), as we don't expect
  // to decode more than 1 keyframe.
  if (dec->segment_hdr_.update_map_) {
    // Hardcoded tree parsing
    block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0])
                    ? VP8GetBit(br, dec->proba_.segments_[1])
                    : 2 + VP8GetBit(br, dec->proba_.segments_[2]);
  } else {
    block->segment_ = 0;  // default for intra
  }
  if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_);
  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
  if (!block->is_i4x4_) {
    // Hardcoded 16x16 intra-mode decision tree.
  dec->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
  if (!dec->is_i4x4_) {
    const int ymode =
        VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
                           : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
-    dec->imodes_[0] = ymode;
+    block->imodes_[0] = ymode;
-    memset(top, ymode, 4 * sizeof(top[0]));
+    memset(top, ymode, 4 * sizeof(*top));
-    memset(left, ymode, 4 * sizeof(left[0]));
+    memset(left, ymode, 4 * sizeof(*left));
  } else {
-    uint8_t* modes = dec->imodes_;
+    uint8_t* modes = block->imodes_;
    int y;
    for (y = 0; y < 4; ++y) {
      int ymode = left[y];
@ -356,10 +316,10 @@ void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
        const uint8_t* const prob = kBModesProba[top[x]][ymode];
 #ifdef USE_GENERIC_TREE
        // Generic tree-parsing
-        int i = 0;
+        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
-        do {
+        while (i > 0) {
          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
-        } while (i > 0);
+        }
        ymode = -i;
 #else
        // Hardcoded tree parsing
@ -374,17 +334,26 @@ void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
 #endif    // USE_GENERIC_TREE
        top[x] = ymode;
        *modes++ = ymode;
      }
      memcpy(modes, top, 4 * sizeof(*top));
      modes += 4;
      left[y] = ymode;
    }
  }
  // Hardcoded UVMode decision tree
-  dec->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
+  block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
                 : !VP8GetBit(br, 114) ? V_PRED
                 : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
 }
 // Parses the intra modes for one row of macroblocks by calling
 // ParseIntraMode() once per column index in [0, dec->mb_w_).
 // Returns 0 if dec->br_.eof_ is set after the row has been parsed, and 1
 // otherwise. Note that the end-of-data flag is read from dec->br_, not from
 // the 'br' argument.
 int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) {
  int col = 0;
  while (col < dec->mb_w_) {
    ParseIntraMode(br, dec, col);
    ++col;
  }
  return dec->br_.eof_ ? 0 : 1;
 }
 //------------------------------------------------------------------------------
 // Paragraph 13
@ -524,17 +493,6 @@ static const uint8_t
  }
 };
 #ifndef ONLY_KEYFRAME_CODE
 // MVUpdateProba[i][k] is the probability handed to VP8GetBit() in
 // VP8ParseProba() when deciding whether proba->mv_[i][k] is replaced by a new
 // 7-bit value read from the bitstream.
 static const uint8_t MVUpdateProba[2][NUM_MV_PROBAS] = {
  { 237, 246, 253, 253, 254, 254, 254, 254,
    254, 254, 254, 254, 254, 254, 250, 250,
    252, 254, 254 },
  { 231, 243, 245, 253, 254, 254, 254, 254,
    254, 254, 254, 254, 254, 254, 251, 251,
    254, 254, 254 }
 };
 #endif
 // Paragraph 9.9
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  VP8Proba* const proba = &dec->proba_;
@ -543,9 +501,9 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
    for (b = 0; b < NUM_BANDS; ++b) {
      for (c = 0; c < NUM_CTX; ++c) {
        for (p = 0; p < NUM_PROBAS; ++p) {
-          if (VP8GetBit(br, CoeffsUpdateProba[t][b][c][p])) {
+          const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
-            proba->coeffs_[t][b][c][p] = VP8GetValue(br, 8);
+                        VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
-          }
+          proba->bands_[t][b].probas_[c][p] = v;
        }
      }
    }
@ -554,36 +512,5 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  if (dec->use_skip_proba_) {
    dec->skip_p_ = VP8GetValue(br, 8);
  }
 #ifndef ONLY_KEYFRAME_CODE
  if (!dec->frm_hdr_.key_frame_) {
    int i;
    dec->intra_p_ = VP8GetValue(br, 8);
    dec->last_p_ = VP8GetValue(br, 8);
    dec->golden_p_ = VP8GetValue(br, 8);
    if (VP8Get(br)) {   // update y-mode
      for (i = 0; i < 4; ++i) {
        proba->ymode_[i] = VP8GetValue(br, 8);
      }
    }
    if (VP8Get(br)) {   // update uv-mode
      for (i = 0; i < 3; ++i) {
        proba->uvmode_[i] = VP8GetValue(br, 8);
      }
    }
    // update MV
    for (i = 0; i < 2; ++i) {
      int k;
      for (k = 0; k < NUM_MV_PROBAS; ++k) {
        if (VP8GetBit(br, MVUpdateProba[i][k])) {
          const int v = VP8GetValue(br, 7);
          proba->mv_[i][k] = v ? v << 1 : 1;
        }
      }
    }
  }
 #endif
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // main entry for the decoder
@ -11,14 +13,12 @@
 #include <stdlib.h>
 #include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
-#include "../utils/bit_reader.h"
+#include "../utils/bit_reader_inl.h"
-
+#include "../utils/utils.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
@ -45,10 +45,10 @@ int VP8InitIoInternal(VP8Io* const io, int version) {
 }
 VP8Decoder* VP8New(void) {
-  VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(*dec));
+  VP8Decoder* const dec = (VP8Decoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
  if (dec != NULL) {
    SetOk(dec);
-    WebPWorkerInit(&dec->worker_);
+    WebPGetWorkerInterface()->Init(&dec->worker_);
    dec->ready_ = 0;
    dec->num_parts_ = 1;
  }
@ -69,7 +69,7 @@ const char* VP8StatusMessage(VP8Decoder* const dec) {
 void VP8Delete(VP8Decoder* const dec) {
  if (dec != NULL) {
    VP8Clear(dec);
-    free(dec);
+    WebPSafeFree(dec);
  }
 }
@ -121,6 +121,9 @@ int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
    if (((bits >> 5)) >= chunk_size) {  // partition_length
      return 0;         // inconsistent size information.
    }
    if (w == 0 || h == 0) {
      return 0;         // We don't support both width and height to be zero.
    }
    if (width) {
      *width = w;
@ -247,7 +250,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
  VP8PictureHeader* pic_hdr;
  VP8BitReader* br;
  VP8StatusCode status;
  WebPHeaderStructure headers;
  if (dec == NULL) {
    return 0;
@ -257,33 +259,8 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
                       "null VP8Io passed to VP8GetHeaders()");
  }
-
+  buf = io->data;
-  // Process Pre-VP8 chunks.
+  buf_size = io->data_size;
  headers.data = io->data;
  headers.data_size = io->data_size;
  status = WebPParseHeaders(&headers);
  if (status != VP8_STATUS_OK) {
    return VP8SetError(dec, status, "Incorrect/incomplete header.");
  }
  if (headers.is_lossless) {
    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                       "Unexpected lossless format encountered.");
  }
  if (dec->alpha_data_ == NULL) {
    assert(dec->alpha_data_size_ == 0);
    // We have NOT set alpha data yet. Set it now.
    // (This is to ensure that dec->alpha_data_ is NOT reset to NULL if
    // WebPParseHeaders() is called more than once, as in incremental decoding
    // case.)
    dec->alpha_data_ = headers.alpha_data;
    dec->alpha_data_size_ = headers.alpha_data_size;
  }
  // Process the VP8 frame header.
  buf = headers.data + headers.offset;
  buf_size = headers.data_size - headers.offset;
  assert(headers.data_size >= headers.offset);  // WebPParseHeaders' guarantee
  if (buf_size < 4) {
    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                       "Truncated header.");
@ -341,7 +318,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
    VP8ResetProba(&dec->proba_);
    ResetSegmentHeader(&dec->segment_hdr_);
    dec->segment_ = 0;    // default for intra
  }
  // Check if we have all the partition #0 available, and initialize dec->br_
@ -379,63 +355,14 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
  // Frame buffer marking
  if (!frm_hdr->key_frame_) {
    // Paragraph 9.7
 #ifndef ONLY_KEYFRAME_CODE
    dec->buffer_flags_ = VP8Get(br) << 0;   // update golden
    dec->buffer_flags_ |= VP8Get(br) << 1;  // update alt ref
    if (!(dec->buffer_flags_ & 1)) {
      dec->buffer_flags_ |= VP8GetValue(br, 2) << 2;
    }
    if (!(dec->buffer_flags_ & 2)) {
      dec->buffer_flags_ |= VP8GetValue(br, 2) << 4;
    }
    dec->buffer_flags_ |= VP8Get(br) << 6;    // sign bias golden
    dec->buffer_flags_ |= VP8Get(br) << 7;    // sign bias alt ref
 #else
    return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
                       "Not a key frame.");
 #endif
  } else {
    dec->buffer_flags_ = 0x003 | 0x100;
  }
-  // Paragraph 9.8
+  VP8Get(br);   // ignore the value of update_proba_
 #ifndef ONLY_KEYFRAME_CODE
  dec->update_proba_ = VP8Get(br);
  if (!dec->update_proba_) {    // save for later restore
    dec->proba_saved_ = dec->proba_;
  }
  dec->buffer_flags_ &= 1 << 8;
  dec->buffer_flags_ |=
      (frm_hdr->key_frame_ || VP8Get(br)) << 8;    // refresh last frame
 #else
  VP8Get(br);   // just ignore the value of update_proba_
 #endif
  VP8ParseProba(br, dec);
 #ifdef WEBP_EXPERIMENTAL_FEATURES
  // Extensions
  if (dec->pic_hdr_.colorspace_) {
    const size_t kTrailerSize = 8;
    const uint8_t kTrailerMarker = 0x01;
    const uint8_t* ext_buf = buf - kTrailerSize;
    size_t size;
    if (frm_hdr->partition_length_ < kTrailerSize ||
        ext_buf[kTrailerSize - 1] != kTrailerMarker) {
      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                         "RIFF: Inconsistent extra information.");
    }
    // Layer
    size = (ext_buf[0] << 0) | (ext_buf[1] << 8) | (ext_buf[2] << 16);
    dec->layer_data_size_ = size;
    dec->layer_data_ = NULL;  // will be set later
    dec->layer_colorspace_ = ext_buf[3];
  }
 #endif
  // sanitized state
  dec->ready_ = 1;
  return 1;
@ -459,9 +386,6 @@ static const uint8_t kZigzag[16] = {
  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
 typedef const uint8_t (*ProbaCtxArray)[NUM_PROBAS];
 // See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
  int v;
@ -495,19 +419,20 @@ static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
 }
 // Returns the position of the last non-zero coeff plus one
-// (and 0 if there's no coeff at all)
+static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob,
 static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
                     int ctx, const quant_t dq, int n, int16_t* out) {
  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
-  const uint8_t* p = prob[n][ctx];
+  const uint8_t* p = prob[n].probas_[ctx];
  if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
    return 0;
  }
  for (; n < 16; ++n) {
-    const ProbaCtxArray p_ctx = prob[kBands[n + 1]];
+    if (!VP8GetBit(br, p[0])) {
-    if (!VP8GetBit(br, p[1])) {
+      return n;  // previous coeff was last non-zero coeff
-      p = p_ctx[0];
+    }
-    } else {  // non zero coeff
+    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
      p = prob[kBands[++n]].probas_[0];
      if (n == 16) return 16;
    }
    {        // non zero coeff
      const VP8ProbaArray* const p_ctx = &prob[kBands[n + 1]].probas_[0];
      int v;
      if (!VP8GetBit(br, p[2])) {
        v = 1;
@ -517,205 +442,172 @@ static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
        p = p_ctx[2];
      }
      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
      if (n < 15 && !VP8GetBit(br, p[0])) {   // EOB
        return n + 1;
      }
    }
  }
  return 16;
 }
-// Alias-safe way of converting 4bytes to 32bits.
+static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
-typedef union {
+  nz_coeffs <<= 2;
-  uint8_t  i8[4];
+  nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;
-  uint32_t i32;
+  return nz_coeffs;
-} PackedNz;
+}
-// Table to unpack four bits into four bytes
+static int ParseResiduals(VP8Decoder* const dec,
 static const PackedNz kUnpackTab[16] = {
  {{0, 0, 0, 0}},  {{1, 0, 0, 0}},  {{0, 1, 0, 0}},  {{1, 1, 0, 0}},
  {{0, 0, 1, 0}},  {{1, 0, 1, 0}},  {{0, 1, 1, 0}},  {{1, 1, 1, 0}},
  {{0, 0, 0, 1}},  {{1, 0, 0, 1}},  {{0, 1, 0, 1}},  {{1, 1, 0, 1}},
  {{0, 0, 1, 1}},  {{1, 0, 1, 1}},  {{0, 1, 1, 1}},  {{1, 1, 1, 1}} };
 // Macro to pack four LSB of four bytes into four bits.
 #if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
    defined(__BIG_ENDIAN__)
 #define PACK_CST 0x08040201U
 #else
 #define PACK_CST 0x01020408U
 #endif
 #define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S))
 static void ParseResiduals(VP8Decoder* const dec,
                          VP8MB* const mb, VP8BitReader* const token_br) {
-  int out_t_nz, out_l_nz, first;
+  VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
-  ProbaArray ac_prob;
+  const VP8BandProbas* ac_proba;
-  const VP8QuantMatrix* q = &dec->dqm_[dec->segment_];
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
-  int16_t* dst = dec->coeffs_;
+  const VP8QuantMatrix* const q = &dec->dqm_[block->segment_];
  int16_t* dst = block->coeffs_;
  VP8MB* const left_mb = dec->mb_info_ - 1;
-  PackedNz nz_ac, nz_dc;
+  uint8_t tnz, lnz;
-  PackedNz tnz, lnz;
+  uint32_t non_zero_y = 0;
-  uint32_t non_zero_ac = 0;
+  uint32_t non_zero_uv = 0;
  uint32_t non_zero_dc = 0;
  int x, y, ch;
  uint32_t out_t_nz, out_l_nz;
  int first;
  nz_dc.i32 = nz_ac.i32 = 0;
  memset(dst, 0, 384 * sizeof(*dst));
-  if (!dec->is_i4x4_) {    // parse DC
+  if (!block->is_i4x4_) {    // parse DC
    int16_t dc[16] = { 0 };
-    const int ctx = mb->dc_nz_ + left_mb->dc_nz_;
+    const int ctx = mb->nz_dc_ + left_mb->nz_dc_;
-    mb->dc_nz_ = left_mb->dc_nz_ =
+    const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc);
-        (GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1],
+    mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0);
-                   ctx, q->y2_mat_, 0, dc) > 0);
+    if (nz > 1) {   // more than just the DC -> perform the full transform
    first = 1;
    ac_prob = (ProbaArray)dec->proba_.coeffs_[0];
      VP8TransformWHT(dc, dst);
    } else {        // only DC is non-zero -> inlined simplified transform
      int i;
      const int dc0 = (dc[0] + 3) >> 3;
      for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0;
    }
    first = 1;
    ac_proba = bands[0];
  } else {
    first = 0;
-    ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
+    ac_proba = bands[3];
  }
-  tnz = kUnpackTab[mb->nz_ & 0xf];
+  tnz = mb->nz_ & 0x0f;
-  lnz = kUnpackTab[left_mb->nz_ & 0xf];
+  lnz = left_mb->nz_ & 0x0f;
  for (y = 0; y < 4; ++y) {
-    int l = lnz.i8[y];
+    int l = lnz & 1;
    uint32_t nz_coeffs = 0;
    for (x = 0; x < 4; ++x) {
-      const int ctx = l + tnz.i8[x];
+      const int ctx = l + (tnz & 1);
-      const int nz = GetCoeffs(token_br, ac_prob, ctx,
+      const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
-                               q->y1_mat_, first, dst);
+      l = (nz > first);
-      tnz.i8[x] = l = (nz > 0);
+      tnz = (tnz >> 1) | (l << 7);
-      nz_dc.i8[x] = (dst[0] != 0);
+      nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
      nz_ac.i8[x] = (nz > 1);
      dst += 16;
    }
-    lnz.i8[y] = l;
+    tnz >>= 4;
-    non_zero_dc |= PACK(nz_dc, 24 - y * 4);
+    lnz = (lnz >> 1) | (l << 7);
-    non_zero_ac |= PACK(nz_ac, 24 - y * 4);
+    non_zero_y = (non_zero_y << 8) | nz_coeffs;
  }
-  out_t_nz = PACK(tnz, 24);
+  out_t_nz = tnz;
-  out_l_nz = PACK(lnz, 24);
+  out_l_nz = lnz >> 4;
  tnz = kUnpackTab[mb->nz_ >> 4];
  lnz = kUnpackTab[left_mb->nz_ >> 4];
  for (ch = 0; ch < 4; ch += 2) {
    uint32_t nz_coeffs = 0;
    tnz = mb->nz_ >> (4 + ch);
    lnz = left_mb->nz_ >> (4 + ch);
    for (y = 0; y < 2; ++y) {
-      int l = lnz.i8[ch + y];
+      int l = lnz & 1;
      for (x = 0; x < 2; ++x) {
-        const int ctx = l + tnz.i8[ch + x];
+        const int ctx = l + (tnz & 1);
-        const int nz =
+        const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
-            GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
+        l = (nz > 0);
-                      ctx, q->uv_mat_, 0, dst);
+        tnz = (tnz >> 1) | (l << 3);
-        tnz.i8[ch + x] = l = (nz > 0);
+        nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
        nz_dc.i8[y * 2 + x] = (dst[0] != 0);
        nz_ac.i8[y * 2 + x] = (nz > 1);
        dst += 16;
      }
-      lnz.i8[ch + y] = l;
+      tnz >>= 2;
      lnz = (lnz >> 1) | (l << 5);
    }
-    non_zero_dc |= PACK(nz_dc, 8 - ch * 2);
+    // Note: we don't really need the per-4x4 details for U/V blocks.
-    non_zero_ac |= PACK(nz_ac, 8 - ch * 2);
+    non_zero_uv |= nz_coeffs << (4 * ch);
    out_t_nz |= (tnz << 4) << ch;
    out_l_nz |= (lnz & 0xf0) << ch;
  }
  out_t_nz |= PACK(tnz, 20);
  out_l_nz |= PACK(lnz, 20);
  mb->nz_ = out_t_nz;
  left_mb->nz_ = out_l_nz;
-  dec->non_zero_ac_ = non_zero_ac;
+  block->non_zero_y_ = non_zero_y;
-  dec->non_zero_ = non_zero_ac | non_zero_dc;
+  block->non_zero_uv_ = non_zero_uv;
-  mb->skip_ = !dec->non_zero_;
+
  // We look at the mode-code of each block and check if some blocks have less
  // than three non-zero coeffs (code < 2). This is to avoid dithering flat and
  // empty blocks.
  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
 }
 #undef PACK
 //------------------------------------------------------------------------------
 // Main loop
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
  VP8BitReader* const br = &dec->br_;
  VP8MB* const left = dec->mb_info_ - 1;
-  VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+  VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
  int skip = dec->use_skip_proba_ ? block->skip_ : 0;
-  // Note: we don't save segment map (yet), as we don't expect
+  if (!skip) {
-  // to decode more than 1 keyframe.
+    skip = ParseResiduals(dec, mb, token_br);
  if (dec->segment_hdr_.update_map_) {
    // Hardcoded tree parsing
    dec->segment_ = !VP8GetBit(br, dec->proba_.segments_[0]) ?
        VP8GetBit(br, dec->proba_.segments_[1]) :
        2 + VP8GetBit(br, dec->proba_.segments_[2]);
  }
  info->skip_ = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
  VP8ParseIntraMode(br, dec);
  if (br->eof_) {
    return 0;
  }
  if (!info->skip_) {
    ParseResiduals(dec, info, token_br);
  } else {
-    left->nz_ = info->nz_ = 0;
+    left->nz_ = mb->nz_ = 0;
-    if (!dec->is_i4x4_) {
+    if (!block->is_i4x4_) {
-      left->dc_nz_ = info->dc_nz_ = 0;
+      left->nz_dc_ = mb->nz_dc_ = 0;
    }
-    dec->non_zero_ = 0;
+    block->non_zero_y_ = 0;
-    dec->non_zero_ac_ = 0;
+    block->non_zero_uv_ = 0;
    block->dither_ = 0;
  }
  if (dec->filter_type_ > 0) {  // store filter info
    VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
-    *finfo = dec->fstrengths_[dec->segment_][dec->is_i4x4_];
+    *finfo = dec->fstrengths_[block->segment_][block->is_i4x4_];
-    finfo->f_inner_ = (!info->skip_ || dec->is_i4x4_);
+    finfo->f_inner_ |= !skip;
  }
-  return (!token_br->eof_);
+  return !token_br->eof_;
 }
 void VP8InitScanline(VP8Decoder* const dec) {
  VP8MB* const left = dec->mb_info_ - 1;
  left->nz_ = 0;
-  left->dc_nz_ = 0;
+  left->nz_dc_ = 0;
  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
-  dec->filter_row_ =
+  dec->mb_x_ = 0;
    (dec->filter_type_ > 0) &&
    (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
 }
 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
    // Parse bitstream for this row.
    VP8BitReader* const token_br =
        &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    VP8InitScanline(dec);
+    if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
-    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                         "Premature end-of-partition0 encountered.");
    }
    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
      if (!VP8DecodeMB(dec, token_br)) {
        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                           "Premature end-of-file encountered.");
      }
      // Reconstruct and emit samples.
      VP8ReconstructBlock(dec);
    }
    VP8InitScanline(dec);   // Prepare for next scanline
    // Reconstruct, filter and emit the row.
    if (!VP8ProcessRow(dec, io)) {
      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
    }
  }
-  if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) {
+  if (dec->mt_method_ > 0) {
-    return 0;
+    if (!WebPGetWorkerInterface()->Sync(&dec->worker_)) return 0;
  }
  // Finish
 #ifndef ONLY_KEYFRAME_CODE
  if (!dec->update_proba_) {
    dec->proba_ = dec->proba_saved_;
  }
 #endif
 #ifdef WEBP_EXPERIMENTAL_FEATURES
  if (dec->layer_data_size_ > 0) {
    if (!VP8DecodeLayer(dec)) {
      return 0;
    }
  }
 #endif
  return 1;
 }
@ -763,12 +655,10 @@ void VP8Clear(VP8Decoder* const dec) {
  if (dec == NULL) {
    return;
  }
-  if (dec->use_threads_) {
+  WebPGetWorkerInterface()->End(&dec->worker_);
-    WebPWorkerEnd(&dec->worker_);
+  ALPHDelete(dec->alph_dec_);
-  }
+  dec->alph_dec_ = NULL;
-  if (dec->mem_) {
+  WebPSafeFree(dec->mem_);
    free(dec->mem_);
  }
  dec->mem_ = NULL;
  dec->mem_size_ = 0;
  memset(&dec->br_, 0, sizeof(dec->br_));
@ -777,6 +667,3 @@ void VP8Clear(VP8Decoder* const dec) {
 //------------------------------------------------------------------------------
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // VP8 decoder: internal header.
@ -15,10 +17,11 @@
 #include <string.h>     // for memcpy()
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
 #include "../utils/random.h"
 #include "../utils/thread.h"
 #include "../dsp/dsp.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
@ -27,10 +30,8 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 3
+#define DEC_MIN_VERSION 4
-#define DEC_REV_VERSION 0
+#define DEC_REV_VERSION 3
 #define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@ -98,6 +99,9 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)
 // minimal width under which lossy multi-threading is always disabled
 #define MIN_WIDTH_FOR_THREADS 512
 //------------------------------------------------------------------------------
 // Headers
@ -126,15 +130,19 @@ typedef struct {
  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
 } VP8SegmentHeader;
 // probas associated to one of the contexts
 typedef uint8_t VP8ProbaArray[NUM_PROBAS];
 typedef struct {   // all the probas associated to one band
  VP8ProbaArray probas_[NUM_CTX];
 } VP8BandProbas;
 // Struct collecting all frame-persistent probabilities.
 typedef struct {
  uint8_t segments_[MB_FEATURE_TREE_PROBS];
  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
-  uint8_t coeffs_[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS];
+  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
 #ifndef ONLY_KEYFRAME_CODE
  uint8_t ymode_[4], uvmode_[3];
  uint8_t mv_[2][NUM_MV_PROBAS];
 #endif
 } VP8Proba;
 // Filter parameters
@ -151,32 +159,61 @@ typedef struct {
 // Information about the macroblocks.
 typedef struct {  // filter specs
-  unsigned int f_level_:6;      // filter strength: 0..63
+  uint8_t f_limit_;      // filter limit in [3..189], or 0 if no filtering
-  unsigned int f_ilevel_:6;     // inner limit: 1..63
+  uint8_t f_ilevel_;     // inner limit in [1..63]
-  unsigned int f_inner_:1;      // do inner filtering?
+  uint8_t f_inner_;      // do inner filtering?
  uint8_t hev_thresh_;   // high edge variance threshold in [0..2]
 } VP8FInfo;
-typedef struct {  // used for syntax-parsing
+typedef struct {  // Top/Left Contexts used for syntax-parsing
-  unsigned int nz_:24;       // non-zero AC/DC coeffs (24bit)
+  uint8_t nz_;        // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma)
-  unsigned int dc_nz_:1;     // non-zero DC coeffs
+  uint8_t nz_dc_;     // non-zero DC coeff (1bit)
  unsigned int skip_:1;      // block type
 } VP8MB;
 // Dequantization matrices
 typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
  quant_t y1_mat_, y2_mat_, uv_mat_;
  int uv_quant_;   // U/V quantizer value
  int dither_;     // dithering amplitude (0 = off, max=255)
 } VP8QuantMatrix;
 // Data needed to reconstruct a macroblock
 typedef struct {
  int16_t coeffs_[384];   // 384 coeffs = (16+4+4) * 4*4
  uint8_t is_i4x4_;       // true if intra4x4
  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
  uint8_t uvmode_;        // chroma prediction mode
  // bit-wise info about the content of each sub-4x4 block (in decoding order).
  // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
  //   code=0 -> no coefficient
  //   code=1 -> only DC
  //   code=2 -> first three coefficients are non-zero
  //   code=3 -> more than three coefficients are non-zero
  // This allows calling specialized transform functions.
  uint32_t non_zero_y_;
  uint32_t non_zero_uv_;
  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
  uint8_t skip_;
  uint8_t segment_;
 } VP8MBData;
 // Persistent information needed by the parallel processing
 typedef struct {
  int id_;              // cache row to process (in [0..2])
  int mb_y_;            // macroblock position of the row
  int filter_row_;      // true if row-filtering is needed
-  VP8FInfo* f_info_;  // filter strengths
+  VP8FInfo* f_info_;    // filter strengths (swapped with dec->f_info_)
  VP8MBData* mb_data_;  // reconstruction data (swapped with dec->mb_data_)
  VP8Io io_;            // copy of the VP8Io to pass to put()
 } VP8ThreadContext;
 // Saved top samples, per macroblock. Fits into a cache-line.
 typedef struct {
  uint8_t y[16], u[8], v[8];
 } VP8TopSamples;
 //------------------------------------------------------------------------------
 // VP8Decoder: the main opaque structure handed over to user
@ -196,7 +233,8 @@ struct VP8Decoder {
  // Worker
  WebPWorker worker_;
-  int use_threads_;    // use multi-thread
+  int mt_method_;      // multi-thread method: 0=off, 1=[parse+recon][filter]
                       // 2=[parse][recon+filter]
  int cache_id_;       // current cache row
  int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
  VP8ThreadContext thread_ctx_;  // Thread context
@ -213,12 +251,9 @@ struct VP8Decoder {
  // per-partition boolean decoders.
  VP8BitReader parts_[MAX_NUM_PARTITIONS];
-  // buffer refresh flags
+  // Dithering strength, deduced from decoding options
-  //   bit 0: refresh Gold, bit 1: refresh Alt
+  int dither_;                // whether to use dithering or not
-  //   bit 2-3: copy to Gold, bit 4-5: copy to Alt
+  VP8Random dithering_rg_;    // random generator for dithering
  //   bit 6: Gold sign bias, bit 7: Alt sign bias
  //   bit 8: refresh last frame
  uint32_t buffer_flags_;
  // dequantization (one set of DC/AC dequant factor per segment)
  VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
@ -227,22 +262,16 @@ struct VP8Decoder {
  VP8Proba proba_;
  int use_skip_proba_;
  uint8_t skip_p_;
 #ifndef ONLY_KEYFRAME_CODE
  uint8_t intra_p_, last_p_, golden_p_;
  VP8Proba proba_saved_;
  int update_proba_;
 #endif
  // Boundary data cache and persistent buffers.
  uint8_t* intra_t_;      // top intra modes values: 4 * mb_w_
  uint8_t  intra_l_[4];   // left intra modes values
-  uint8_t* y_t_;         // top luma samples: 16 * mb_w_
+
-  uint8_t* u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
+  VP8TopSamples* yuv_t_;  // top y/u/v samples
  VP8MB* mb_info_;        // contextual macroblock info (mb_w_ + 1)
  VP8FInfo* f_info_;      // filter strength info
  uint8_t* yuv_b_;        // main block for Y/U/V (size = YUV_SIZE)
  int16_t* coeffs_;      // 384 coeffs = (16+8+8) * 4*4
  uint8_t* cache_y_;      // macroblock row for storing unfiltered samples
  uint8_t* cache_u_;
@ -256,31 +285,19 @@ struct VP8Decoder {
  // Per-macroblock non-persistent information.
  int mb_x_, mb_y_;       // current position, in macroblock units
-  uint8_t is_i4x4_;       // true if intra4x4
+  VP8MBData* mb_data_;    // parsed reconstruction data
  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
  uint8_t uvmode_;        // chroma prediction mode
  uint8_t segment_;       // block's segment
  // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
  // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for
  // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order.
  // If the bit is set, the 4x4 block contains some non-zero coefficients.
  uint32_t non_zero_;
  uint32_t non_zero_ac_;
  // Filtering side-info
  int filter_type_;                          // 0=off, 1=simple, 2=complex
  int filter_row_;                           // per-row flag
  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type
-  // extensions
+  // Alpha
  struct ALPHDecoder* alph_dec_;  // alpha-plane decoder object
  const uint8_t* alpha_data_;     // compressed alpha data (if present)
  size_t alpha_data_size_;
  int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
  uint8_t* alpha_plane_;  // output. Persistent, contains the whole data.
-
+  int alpha_dithering_;   // derived from decoding options (0=off, 100=full).
  int layer_colorspace_;
  const uint8_t* layer_data_;   // compressed layer data (if present)
  size_t layer_data_size_;
 };
 //------------------------------------------------------------------------------
@ -293,15 +310,14 @@ int VP8SetError(VP8Decoder* const dec,
 // in tree.c
 void VP8ResetProba(VP8Proba* const proba);
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
-void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec);
+// parses one row of intra mode data in partition 0, returns !eof
 int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);
 // in quant.c
 void VP8ParseQuant(VP8Decoder* const dec);
 // in frame.c
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
 // Predict a block and add residual
 void VP8ReconstructBlock(VP8Decoder* const dec);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
@ -310,7 +326,15 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
 // Must always be called in pair with VP8EnterCritical().
 // Returns false in case of error.
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
-// Process the last decoded row (filtering + output)
+// Return the multi-threading method to use (0=off), depending
 // on options and bitstream size. Only for lossy decoding.
 int VP8GetThreadMethod(const WebPDecoderOptions* const options,
                       const WebPHeaderStructure* const headers,
                       int width, int height);
 // Initialize dithering post-process if needed.
 void VP8InitDithering(const WebPDecoderOptions* const options,
                      VP8Decoder* const dec);
 // Process the last decoded row (filtering + output).
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
@ -321,12 +345,9 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                      int row, int num_rows);
 // in layer.c
 int VP8DecodeLayer(VP8Decoder* const dec);
 //------------------------------------------------------------------------------
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
--- a/src/dec/vp8li.h
+++ b/src/dec/vp8li.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Lossless decoder: internal header.
@ -18,9 +20,8 @@
 #include "../utils/bit_reader.h"
 #include "../utils/color_cache.h"
 #include "../utils/huffman.h"
 #include "../webp/format_constants.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
@ -39,10 +40,6 @@ struct VP8LTransform {
  uint32_t              *data_;   // transform data.
 };
 typedef struct {
  HuffmanTree htrees_[HUFFMAN_CODES_PER_META_CODE];
 } HTreeGroup;
 typedef struct {
  int             color_cache_size_;
  VP8LColorCache  color_cache_;
@ -55,7 +52,8 @@ typedef struct {
  HTreeGroup     *htree_groups_;
 } VP8LMetadata;
-typedef struct {
+typedef struct VP8LDecoder VP8LDecoder;
 struct VP8LDecoder {
  VP8StatusCode    status_;
  VP8LDecodeState  action_;
  VP8LDecodeState  state_;
@ -63,7 +61,8 @@ typedef struct {
  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
-  uint32_t        *argb_;          // Internal data: always in BGRA color mode.
+  uint32_t        *pixels_;        // Internal data: either uint8_t* for alpha
                                   // or uint32_t* for BGRA.
  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
  VP8LBitReader    br_;
@ -71,6 +70,9 @@ typedef struct {
  int              width_;
  int              height_;
  int              last_row_;      // last input row decoded so far.
  int              last_pixel_;    // last pixel decoded so far. However, it may
                                   // not be transformed, scaled and
                                   // color-converted yet.
  int              last_out_row_;  // last row output so far.
  VP8LMetadata     hdr_;
@ -82,18 +84,27 @@ typedef struct {
  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
  WebPRescaler    *rescaler;         // Common rescaler for all channels.
-} VP8LDecoder;
+};
 //------------------------------------------------------------------------------
 // internal functions. Not public.
 struct ALPHDecoder;  // Defined in dec/alphai.h.
 // in vp8l.c
-// Decodes a raw image stream (without header) and store the alpha data
+// Decodes image header for alpha data stored using lossless compression.
-// into *output, which must be of size width x height. Returns false in case
+// Returns false in case of error.
-// of error.
+int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
-int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
+                          const uint8_t* const data, size_t data_size,
-                               size_t data_size, uint8_t* const output);
+                          uint8_t* const output);
 // Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
 // already decoded in previous call(s), it will resume decoding from where it
 // was paused.
 // Returns false in case of bitstream error.
 int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
                               int last_row);
 // Allocates and initializes a new lossless decoder instance.
 VP8LDecoder* VP8LNew(void);
@ -114,7 +125,7 @@ void VP8LDelete(VP8LDecoder* const dec);
 //------------------------------------------------------------------------------
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Main decoding functions for WEBP images.
@ -16,10 +18,6 @@
 #include "./webpi.h"
 #include "../webp/mux_types.h"  // ALPHA_FLAG
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // RIFF layout is:
 //   Offset  tag
@ -54,13 +52,14 @@ static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) {
 }
 // Validates the RIFF container (if detected) and skips over it.
-// If a RIFF container is detected,
+// If a RIFF container is detected, returns:
-// Returns VP8_STATUS_BITSTREAM_ERROR for invalid header, and
+//     VP8_STATUS_BITSTREAM_ERROR for invalid header,
-//         VP8_STATUS_OK otherwise.
+//     VP8_STATUS_NOT_ENOUGH_DATA for truncated data if have_all_data is true,
 // and VP8_STATUS_OK otherwise.
 // In case there are not enough bytes (partial RIFF container), return 0 for
 // *riff_size. Else return the RIFF size extracted from the header.
 static VP8StatusCode ParseRIFF(const uint8_t** const data,
-                               size_t* const data_size,
+                               size_t* const data_size, int have_all_data,
                               size_t* const riff_size) {
  assert(data != NULL);
  assert(data_size != NULL);
@ -79,6 +78,9 @@ static VP8StatusCode ParseRIFF(const uint8_t** const data,
      if (size > MAX_CHUNK_PAYLOAD) {
        return VP8_STATUS_BITSTREAM_ERROR;
      }
      if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) {
        return VP8_STATUS_NOT_ENOUGH_DATA;  // Truncated bitstream.
      }
      // We have a RIFF container. Skip it.
      *riff_size = size;
      *data += RIFF_HEADER_SIZE;
@ -192,6 +194,15 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
    }
    // Start of a (possibly incomplete) VP8/VP8L chunk implies that we have
    // parsed all the optional chunks.
    // Note: This check must occur before the check 'buf_size < disk_chunk_size'
    // below to allow incomplete VP8/VP8L chunks.
    if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
        !memcmp(buf, "VP8L", TAG_SIZE)) {
      return VP8_STATUS_OK;
    }
    if (buf_size < disk_chunk_size) {             // Insufficient data.
      return VP8_STATUS_NOT_ENOUGH_DATA;
    }
@ -199,9 +210,6 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
    if (!memcmp(buf, "ALPH", TAG_SIZE)) {         // A valid ALPH header.
      *alpha_data = buf + CHUNK_HEADER_SIZE;
      *alpha_size = chunk_size;
    } else if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
               !memcmp(buf, "VP8L", TAG_SIZE)) {  // A valid VP8/VP8L header.
      return VP8_STATUS_OK;  // Found.
    }
    // We have a full and valid chunk; skip it.
@ -219,9 +227,8 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
 // extracted from the VP8/VP8L chunk header.
 // The flag '*is_lossless' is set to 1 in case of VP8L chunk / raw VP8L data.
 static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
-                                    size_t* const data_size,
+                                    size_t* const data_size, int have_all_data,
-                                    size_t riff_size,
+                                    size_t riff_size, size_t* const chunk_size,
                                    size_t* const chunk_size,
                                    int* const is_lossless) {
  const uint8_t* const data = *data_ptr;
  const int is_vp8 = !memcmp(data, "VP8 ", TAG_SIZE);
@ -244,6 +251,9 @@ static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
    if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Inconsistent size information.
    }
    if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) {
      return VP8_STATUS_NOT_ENOUGH_DATA;  // Truncated bitstream.
    }
    // Skip over CHUNK_HEADER_SIZE bytes from VP8/VP8L Header.
    *chunk_size = size;
    *data_ptr += CHUNK_HEADER_SIZE;
@ -277,9 +287,18 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
                                          int* const height,
                                          int* const has_alpha,
                                          int* const has_animation,
                                          int* const format,
                                          WebPHeaderStructure* const headers) {
  int canvas_width = 0;
  int canvas_height = 0;
  int image_width = 0;
  int image_height = 0;
  int found_riff = 0;
  int found_vp8x = 0;
  int animation_present = 0;
  int fragments_present = 0;
  const int have_all_data = (headers != NULL) ? headers->have_all_data : 0;
  VP8StatusCode status;
  WebPHeaderStructure hdrs;
@ -291,7 +310,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
  hdrs.data_size = data_size;
  // Skip over RIFF header.
-  status = ParseRIFF(&data, &data_size, &hdrs.riff_size);
+  status = ParseRIFF(&data, &data_size, have_all_data, &hdrs.riff_size);
  if (status != VP8_STATUS_OK) {
    return status;   // Wrong RIFF header / insufficient data.
  }
@ -300,23 +319,35 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
  // Skip over VP8X.
  {
    uint32_t flags = 0;
-    status = ParseVP8X(&data, &data_size, &found_vp8x, width, height, &flags);
+    status = ParseVP8X(&data, &data_size, &found_vp8x,
                       &canvas_width, &canvas_height, &flags);
    if (status != VP8_STATUS_OK) {
      return status;  // Wrong VP8X / insufficient data.
    }
    animation_present = !!(flags & ANIMATION_FLAG);
    fragments_present = !!(flags & FRAGMENTS_FLAG);
    if (!found_riff && found_vp8x) {
      // Note: This restriction may be removed in the future, if it becomes
      // necessary to send VP8X chunk to the decoder.
      return VP8_STATUS_BITSTREAM_ERROR;
    }
    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
-    if (has_animation != NULL) *has_animation = !!(flags & ANIMATION_FLAG);
+    if (has_animation != NULL) *has_animation = animation_present;
-    if (found_vp8x && headers == NULL) {
+    if (format != NULL) *format = 0;   // default = undefined
-      return VP8_STATUS_OK;  // Return features from VP8X header.
+
    image_width = canvas_width;
    image_height = canvas_height;
    if (found_vp8x && (animation_present || fragments_present) &&
        headers == NULL) {
      status = VP8_STATUS_OK;
      goto ReturnWidthHeight;  // Just return features from VP8X header.
    }
  }
-  if (data_size < TAG_SIZE) return VP8_STATUS_NOT_ENOUGH_DATA;
+  if (data_size < TAG_SIZE) {
    status = VP8_STATUS_NOT_ENOUGH_DATA;
    goto ReturnWidthHeight;
  }
  // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
  if ((found_riff && found_vp8x) ||
@ -324,43 +355,49 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
    status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
                                 &hdrs.alpha_data, &hdrs.alpha_data_size);
    if (status != VP8_STATUS_OK) {
-      return status;  // Found an invalid chunk size / insufficient data.
+      goto ReturnWidthHeight;  // Invalid chunk size / insufficient data.
    }
  }
  // Skip over VP8/VP8L header.
-  status = ParseVP8Header(&data, &data_size, hdrs.riff_size,
+  status = ParseVP8Header(&data, &data_size, have_all_data, hdrs.riff_size,
                          &hdrs.compressed_size, &hdrs.is_lossless);
  if (status != VP8_STATUS_OK) {
-    return status;  // Wrong VP8/VP8L chunk-header / insufficient data.
+    goto ReturnWidthHeight;  // Wrong VP8/VP8L chunk-header / insufficient data.
  }
  if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
    return VP8_STATUS_BITSTREAM_ERROR;
  }
  if (format != NULL && !(animation_present || fragments_present)) {
    *format = hdrs.is_lossless ? 2 : 1;
  }
  if (!hdrs.is_lossless) {
    if (data_size < VP8_FRAME_HEADER_SIZE) {
-      return VP8_STATUS_NOT_ENOUGH_DATA;
+      status = VP8_STATUS_NOT_ENOUGH_DATA;
      goto ReturnWidthHeight;
    }
    // Validates raw VP8 data.
-    if (!VP8GetInfo(data, data_size,
+    if (!VP8GetInfo(data, data_size, (uint32_t)hdrs.compressed_size,
-                    (uint32_t)hdrs.compressed_size, width, height)) {
+                    &image_width, &image_height)) {
      return VP8_STATUS_BITSTREAM_ERROR;
    }
  } else {
    if (data_size < VP8L_FRAME_HEADER_SIZE) {
-      return VP8_STATUS_NOT_ENOUGH_DATA;
+      status = VP8_STATUS_NOT_ENOUGH_DATA;
      goto ReturnWidthHeight;
    }
    // Validates raw VP8L data.
-    if (!VP8LGetInfo(data, data_size, width, height, has_alpha)) {
+    if (!VP8LGetInfo(data, data_size, &image_width, &image_height, has_alpha)) {
      return VP8_STATUS_BITSTREAM_ERROR;
    }
  }
-
+  // Validates image size coherency.
-  if (has_alpha != NULL) {
+  if (found_vp8x) {
-    // If the data did not contain a VP8X/VP8L chunk the only definitive way
+    if (canvas_width != image_width || canvas_height != image_height) {
-    // to set this is by looking for alpha data (from an ALPH chunk).
+      return VP8_STATUS_BITSTREAM_ERROR;
-    *has_alpha |= (hdrs.alpha_data != NULL);
+    }
  }
  if (headers != NULL) {
    *headers = hdrs;
@ -368,7 +405,20 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
    assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
    assert(headers->offset == headers->data_size - data_size);
  }
-  return VP8_STATUS_OK;  // Return features from VP8 header.
+ ReturnWidthHeight:
  if (status == VP8_STATUS_OK ||
      (status == VP8_STATUS_NOT_ENOUGH_DATA && found_vp8x && headers == NULL)) {
    if (has_alpha != NULL) {
      // If the data did not contain a VP8X/VP8L chunk the only definitive way
      // to set this is by looking for alpha data (from an ALPH chunk).
      *has_alpha |= (hdrs.alpha_data != NULL);
    }
    if (width != NULL) *width = image_width;
    if (height != NULL) *height = image_height;
    return VP8_STATUS_OK;
  } else {
    return status;
  }
 }
 VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
@ -377,7 +427,8 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
  assert(headers != NULL);
  // fill out headers, ignore width/height/has_alpha.
  status = ParseHeadersInternal(headers->data, headers->data_size,
-                                NULL, NULL, NULL, &has_animation, headers);
+                                NULL, NULL, NULL, &has_animation,
                                NULL, headers);
  if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
    // TODO(jzern): full support of animation frames will require API additions.
    if (has_animation) {
@ -391,7 +442,7 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
 // WebPDecParams
 void WebPResetDecParams(WebPDecParams* const params) {
-  if (params) {
+  if (params != NULL) {
    memset(params, 0, sizeof(*params));
  }
 }
@ -408,6 +459,7 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
  headers.data = data;
  headers.data_size = data_size;
  headers.have_all_data = 1;
  status = WebPParseHeaders(&headers);   // Process Pre-VP8 chunks.
  if (status != VP8_STATUS_OK) {
    return status;
@ -424,11 +476,6 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
    if (dec == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
 #ifdef WEBP_USE_THREAD
    dec->use_threads_ = params->options && (params->options->use_threads > 0);
 #else
    dec->use_threads_ = 0;
 #endif
    dec->alpha_data_ = headers.alpha_data;
    dec->alpha_data_size_ = headers.alpha_data_size;
@ -440,6 +487,10 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
                                     params->output);
      if (status == VP8_STATUS_OK) {  // Decode
        // This change must be done before calling VP8Decode()
        dec->mt_method_ = VP8GetThreadMethod(params->options, &headers,
                                             io.width, io.height);
        VP8InitDithering(params->options, dec);
        if (!VP8Decode(dec, &io)) {
          status = dec->status_;
        }
@ -469,6 +520,12 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
  if (status != VP8_STATUS_OK) {
    WebPFreeDecBuffer(params->output);
  }
 #if WEBP_DECODER_ABI_VERSION > 0x0203
  if (params->options != NULL && params->options->flip) {
    status = WebPFlipBuffer(params->output);
  }
 #endif
  return status;
 }
@ -626,7 +683,6 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
 static void DefaultFeatures(WebPBitstreamFeatures* const features) {
  assert(features != NULL);
  memset(features, 0, sizeof(*features));
  features->bitstream_version = 0;
 }
 static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
@ -640,7 +696,7 @@ static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
  return ParseHeadersInternal(data, data_size,
                              &features->width, &features->height,
                              &features->has_alpha, &features->has_animation,
-                              NULL);
+                              &features->format, NULL);
 }
 //------------------------------------------------------------------------------
@ -734,9 +790,9 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
    h = options->crop_height;
    x = options->crop_left;
    y = options->crop_top;
-    if (!WebPIsRGBMode(src_colorspace)) {   // only snap for YUV420 or YUV422
+    if (!WebPIsRGBMode(src_colorspace)) {   // only snap for YUV420
      x &= ~1;
-      y &= ~1;    // TODO(later): only for YUV420, not YUV422.
+      y &= ~1;
    }
    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
      return 0;  // out of frame boundary error
@ -778,6 +834,3 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
 //------------------------------------------------------------------------------
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Internal header: WebP decoding parameters and custom IO on buffer
@ -12,7 +14,7 @@
 #ifndef WEBP_DEC_WEBPI_H_
 #define WEBP_DEC_WEBPI_H_
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
@ -52,6 +54,7 @@ void WebPResetDecParams(WebPDecParams* const params);
 typedef struct {
  const uint8_t* data;         // input buffer
  size_t data_size;            // input buffer size
  int have_all_data;           // true if all data is known to be available
  size_t offset;               // offset to main data chunk (VP8 or VP8L)
  const uint8_t* alpha_data;   // points to alpha chunk (if present)
  size_t alpha_data_size;      // alpha chunk size
@ -91,10 +94,15 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
 // dimension / etc.). If *options is not NULL, also verify that the options'
 // parameters are valid and apply them to the width/height dimensions of the
 // output buffer. This takes cropping / scaling / rotation into account.
 // Also incorporates the options->flip flag to flip the buffer parameters if
 // needed.
 VP8StatusCode WebPAllocateDecBuffer(int width, int height,
                                    const WebPDecoderOptions* const options,
                                    WebPDecBuffer* const buffer);
 // Flip buffer vertically by negating the various strides.
 VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer);
 // Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
 // memory (still held by 'src').
 void WebPCopyDecBuffer(const WebPDecBuffer* const src,
@ -103,11 +111,9 @@ void WebPCopyDecBuffer(const WebPDecBuffer* const src,
 // Copy and transfer ownership from src to dst (beware of parameter order!)
 void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
 //------------------------------------------------------------------------------
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -1,4 +1,3 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src
 lib_LTLIBRARIES = libwebpdemux.la
 libwebpdemux_la_SOURCES =
@ -10,6 +9,6 @@ libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
 libwebpdemuxinclude_HEADERS += ../webp/types.h
 libwebpdemux_la_LIBADD = ../libwebp.la
-libwebpdemux_la_LDFLAGS = -no-undefined -version-info 0:0:0
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 1:2:0
 libwebpdemuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@ -1,15 +1,17 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  WebP container demux.
 //
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "../webp/config.h"
 #endif
 #include <assert.h>
@ -21,13 +23,9 @@
 #include "../webp/demux.h"
 #include "../webp/format_constants.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #define DMUX_MAJ_VERSION 0
-#define DMUX_MIN_VERSION 1
+#define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 0
+#define DMUX_REV_VERSION 2
 typedef struct {
  size_t start_;        // start location of the data
@ -45,8 +43,10 @@ typedef struct {
 typedef struct Frame {
  int x_offset_, y_offset_;
  int width_, height_;
  int has_alpha_;
  int duration_;
  WebPMuxAnimDispose dispose_method_;
  WebPMuxAnimBlend blend_method_;
  int is_fragment_;  // this is a frame fragment (and not a full frame).
  int frame_num_;  // the referent frame number for use in assembling fragments.
  int complete_;   // img_components_ contains a full image.
@ -71,6 +71,7 @@ struct WebPDemuxer {
  Frame* frames_;
  Frame** frames_tail_;
  Chunk* chunks_;  // non-image chunks
  Chunk** chunks_tail_;
 };
 typedef enum {
@ -175,10 +176,9 @@ static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
 // Secondary chunk parsing
 static void AddChunk(WebPDemuxer* const dmux, Chunk* const chunk) {
-  Chunk** c = &dmux->chunks_;
+  *dmux->chunks_tail_ = chunk;
  while (*c != NULL) c = &(*c)->next_;
  *c = chunk;
  chunk->next_ = NULL;
  dmux->chunks_tail_ = &chunk->next_;
 }
 // Add a frame to the end of the list, ensuring the last frame is complete.
@ -194,18 +194,13 @@ static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
 }
 // Store image bearing chunks to 'frame'.
 // If 'has_vp8l_alpha' is not NULL, it will be set to true if the frame is a
 // lossless image with alpha.
 static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
-                              MemBuffer* const mem, Frame* const frame,
+                              MemBuffer* const mem, Frame* const frame) {
                              int* const has_vp8l_alpha) {
  int alpha_chunks = 0;
  int image_chunks = 0;
  int done = (MemDataSize(mem) < min_size);
  ParseStatus status = PARSE_OK;
  if (has_vp8l_alpha != NULL) *has_vp8l_alpha = 0;  // Default.
  if (done) return PARSE_NEED_MORE_DATA;
  do {
@ -227,6 +222,7 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
          ++alpha_chunks;
          frame->img_components_[1].offset_ = chunk_start_offset;
          frame->img_components_[1].size_ = chunk_size;
          frame->has_alpha_ = 1;
          frame->frame_num_ = frame_num;
          Skip(mem, payload_available);
        } else {
@ -256,7 +252,7 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
          frame->img_components_[0].size_ = chunk_size;
          frame->width_ = features.width;
          frame->height_ = features.height;
-          if (has_vp8l_alpha != NULL) *has_vp8l_alpha = features.has_alpha;
+          frame->has_alpha_ |= features.has_alpha;
          frame->frame_num_ = frame_num;
          frame->complete_ = (status == PARSE_OK);
          Skip(mem, payload_available);
@ -293,7 +289,7 @@ static ParseStatus NewFrame(const MemBuffer* const mem,
  if (actual_size < min_size) return PARSE_ERROR;
  if (MemDataSize(mem) < min_size)  return PARSE_NEED_MORE_DATA;
-  *frame = (Frame*)calloc(1, sizeof(**frame));
+  *frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(**frame));
  return (*frame == NULL) ? PARSE_ERROR : PARSE_OK;
 }
@ -301,9 +297,10 @@ static ParseStatus NewFrame(const MemBuffer* const mem,
 // 'frame_chunk_size' is the previously validated, padded chunk size.
 static ParseStatus ParseAnimationFrame(
    WebPDemuxer* const dmux, uint32_t frame_chunk_size) {
-  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
  const uint32_t anmf_payload_size = frame_chunk_size - ANMF_CHUNK_SIZE;
  int added_frame = 0;
  int bits;
  MemBuffer* const mem = &dmux->mem_;
  Frame* frame;
  ParseStatus status =
@ -315,16 +312,19 @@ static ParseStatus ParseAnimationFrame(
  frame->width_          = 1 + ReadLE24s(mem);
  frame->height_         = 1 + ReadLE24s(mem);
  frame->duration_       = ReadLE24s(mem);
-  frame->dispose_method_ = (WebPMuxAnimDispose)(ReadByte(mem) & 1);
+  bits = ReadByte(mem);
  frame->dispose_method_ =
      (bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
  frame->blend_method_ = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
  if (frame->width_ * (uint64_t)frame->height_ >= MAX_IMAGE_AREA) {
    WebPSafeFree(frame);
    return PARSE_ERROR;
  }
  // Store a frame only if the animation flag is set and there is some data
  // available for this frame.
-  status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame,
+  status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
-                      NULL);
+  if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
  if (status != PARSE_ERROR && has_frames && frame->frame_num_ > 0) {
    added_frame = AddFrame(dmux, frame);
    if (added_frame) {
      ++dmux->num_frames_;
@ -333,7 +333,7 @@ static ParseStatus ParseAnimationFrame(
    }
  }
-  if (!added_frame) free(frame);
+  if (!added_frame) WebPSafeFree(frame);
  return status;
 }
@ -343,7 +343,7 @@ static ParseStatus ParseAnimationFrame(
 static ParseStatus ParseFragment(WebPDemuxer* const dmux,
                                 uint32_t fragment_chunk_size) {
  const int frame_num = 1;  // All fragments belong to the 1st (and only) frame.
-  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
  const uint32_t frgm_payload_size = fragment_chunk_size - FRGM_CHUNK_SIZE;
  int added_fragment = 0;
  MemBuffer* const mem = &dmux->mem_;
@ -356,10 +356,10 @@ static ParseStatus ParseFragment(WebPDemuxer* const dmux,
  frame->x_offset_ = 2 * ReadLE24s(mem);
  frame->y_offset_ = 2 * ReadLE24s(mem);
-  // Store a fragment only if the fragments flag is set there is some data for
+  // Store a fragment only if the 'fragments' flag is set and there is some
-  // this fragment is available.
+  // data available.
-  status = StoreFrame(frame_num, frgm_payload_size, mem, frame, NULL);
+  status = StoreFrame(frame_num, frgm_payload_size, mem, frame);
-  if (status != PARSE_ERROR && has_fragments && frame->frame_num_ > 0) {
+  if (status != PARSE_ERROR && is_fragmented && frame->frame_num_ > 0) {
    added_fragment = AddFrame(dmux, frame);
    if (!added_fragment) {
      status = PARSE_ERROR;
@ -368,7 +368,7 @@ static ParseStatus ParseFragment(WebPDemuxer* const dmux,
    }
  }
-  if (!added_fragment) free(frame);
+  if (!added_fragment) WebPSafeFree(frame);
  return status;
 }
 #endif  // WEBP_EXPERIMENTAL_FEATURES
@ -379,7 +379,7 @@ static ParseStatus ParseFragment(WebPDemuxer* const dmux,
 // Returns true on success, false otherwise.
 static int StoreChunk(WebPDemuxer* const dmux,
                      size_t start_offset, uint32_t size) {
-  Chunk* const chunk = (Chunk*)calloc(1, sizeof(*chunk));
+  Chunk* const chunk = (Chunk*)WebPSafeCalloc(1ULL, sizeof(*chunk));
  if (chunk == NULL) return 0;
  chunk->data_.offset_ = start_offset;
@ -391,20 +391,20 @@ static int StoreChunk(WebPDemuxer* const dmux,
 // -----------------------------------------------------------------------------
 // Primary chunk parsing
-static int ReadHeader(MemBuffer* const mem) {
+static ParseStatus ReadHeader(MemBuffer* const mem) {
  const size_t min_size = RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE;
  uint32_t riff_size;
  // Basic file level validation.
-  if (MemDataSize(mem) < min_size) return 0;
+  if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
  if (memcmp(GetBuffer(mem), "RIFF", CHUNK_SIZE_BYTES) ||
      memcmp(GetBuffer(mem) + CHUNK_HEADER_SIZE, "WEBP", CHUNK_SIZE_BYTES)) {
-    return 0;
+    return PARSE_ERROR;
  }
  riff_size = GetLE32(GetBuffer(mem) + TAG_SIZE);
-  if (riff_size < CHUNK_HEADER_SIZE) return 0;
+  if (riff_size < CHUNK_HEADER_SIZE) return PARSE_ERROR;
-  if (riff_size > MAX_CHUNK_PAYLOAD) return 0;
+  if (riff_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
  // There's no point in reading past the end of the RIFF chunk
  mem->riff_end_ = riff_size + CHUNK_HEADER_SIZE;
@ -413,7 +413,7 @@ static int ReadHeader(MemBuffer* const mem) {
  }
  Skip(mem, RIFF_HEADER_SIZE);
-  return 1;
+  return PARSE_OK;
 }
 static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
@ -421,25 +421,25 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
  MemBuffer* const mem = &dmux->mem_;
  Frame* frame;
  ParseStatus status;
-  int has_vp8l_alpha = 0;  // Frame contains a lossless image with alpha.
+  int image_added = 0;
  if (dmux->frames_ != NULL) return PARSE_ERROR;
  if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
  if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
-  frame = (Frame*)calloc(1, sizeof(*frame));
+  frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
  if (frame == NULL) return PARSE_ERROR;
  // For the single image case we allow parsing of a partial frame, but we need
  // at least CHUNK_HEADER_SIZE for parsing.
-  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame,
+  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
                      &has_vp8l_alpha);
  if (status != PARSE_ERROR) {
    const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
    // Clear any alpha when the alpha flag is missing.
    if (!has_alpha && frame->img_components_[1].size_ > 0) {
      frame->img_components_[1].offset_ = 0;
      frame->img_components_[1].size_ = 0;
      frame->has_alpha_ = 0;
    }
    // Use the frame width/height as the canvas values for non-vp8x files.
@ -448,47 +448,26 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
      dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
      dmux->canvas_width_ = frame->width_;
      dmux->canvas_height_ = frame->height_;
-      dmux->feature_flags_ |= has_vp8l_alpha ? ALPHA_FLAG : 0;
+      dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
    }
-    AddFrame(dmux, frame);
+    if (!AddFrame(dmux, frame)) {
-    dmux->num_frames_ = 1;
+      status = PARSE_ERROR;  // last frame was left incomplete
    } else {
-    free(frame);
+      image_added = 1;
      dmux->num_frames_ = 1;
    }
  }
  if (!image_added) WebPSafeFree(frame);
  return status;
 }
-static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
+static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
  MemBuffer* const mem = &dmux->mem_;
  int anim_chunks = 0;
  uint32_t vp8x_size;
  ParseStatus status = PARSE_OK;
  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
  dmux->is_ext_format_ = 1;
  Skip(mem, TAG_SIZE);  // VP8X
  vp8x_size = ReadLE32(mem);
  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
  vp8x_size += vp8x_size & 1;
  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
  dmux->feature_flags_ = ReadByte(mem);
  Skip(mem, 3);  // Reserved.
  dmux->canvas_width_  = 1 + ReadLE24s(mem);
  dmux->canvas_height_ = 1 + ReadLE24s(mem);
  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
    return PARSE_ERROR;  // image final dimension is too large
  }
  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
  do {
    int store_chunk = 1;
    const size_t chunk_start_offset = mem->start_;
@ -507,7 +486,7 @@ static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
      case MKFOURCC('V', 'P', '8', ' '):
      case MKFOURCC('V', 'P', '8', 'L'): {
        // check that this isn't an animation (all frames should be in an ANMF).
-        if (anim_chunks > 0) return PARSE_ERROR;
+        if (anim_chunks > 0 || is_animation) return PARSE_ERROR;
        Rewind(mem, CHUNK_HEADER_SIZE);
        status = ParseSingleImage(dmux);
@ -544,14 +523,14 @@ static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
        store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
        goto Skip;
      }
      case MKFOURCC('X', 'M', 'P', ' '): {
        store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
        goto Skip;
      }
      case MKFOURCC('E', 'X', 'I', 'F'): {
        store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
        goto Skip;
      }
      case MKFOURCC('X', 'M', 'P', ' '): {
        store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
        goto Skip;
      }
 Skip:
      default: {
        if (chunk_size_padded <= MemDataSize(mem)) {
@ -580,6 +559,37 @@ static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
  return status;
 }
 // Parses the VP8X chunk located at the current position of 'dmux->mem_'.
 // Marks the demuxer as using the extended format, stores the feature flags
 // and the canvas width/height in 'dmux', sets the state to
 // WEBP_DEMUX_PARSED_HEADER and then delegates to ParseVP8XChunks().
 // Returns PARSE_NEED_MORE_DATA when a MemDataSize() check fails,
 // PARSE_ERROR when the chunk size or the canvas area is rejected, and
 // otherwise whatever ParseVP8XChunks() returns.
 // NOTE(review): MemDataSize()/SizeIsInvalid() are defined outside this view;
 // presumably they report the unread byte count and validate a size against
 // the RIFF bounds -- confirm against their definitions.
 static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
  MemBuffer* const mem = &dmux->mem_;
  uint32_t vp8x_size;
  // The tag and size fields are consumed below, so a whole chunk header is
  // required up front.
  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
  dmux->is_ext_format_ = 1;
  Skip(mem, TAG_SIZE);  // VP8X
  vp8x_size = ReadLE32(mem);
  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
  // Adds one when the size is odd, so the value used below is always even.
  vp8x_size += vp8x_size & 1;
  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
  dmux->feature_flags_ = ReadByte(mem);
  Skip(mem, 3);  // Reserved.
  // The canvas dimensions are the values read plus one.
  dmux->canvas_width_  = 1 + ReadLE24s(mem);
  dmux->canvas_height_ = 1 + ReadLE24s(mem);
  // The uint64_t cast makes the product a 64-bit computation.
  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
    return PARSE_ERROR;  // image final dimension is too large
  }
  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
  // The state is updated before the remaining checks, so it is kept even if
  // one of them returns early.
  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
  return ParseVP8XChunks(dmux);
 }
 // -----------------------------------------------------------------------------
 // Format validation
@ -594,18 +604,42 @@ static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {
  return 1;
 }
 // Checks the placement of 'frame' against a canvas of size
 // 'canvas_width' x 'canvas_height'. Returns 1 if acceptable, 0 otherwise.
 //  - 'exact' is true: the frame must have zero x/y offsets and the same
 //    width/height as the canvas.
 //  - 'exact' is false: the x/y offsets must be non-negative and the frame
 //    must not extend beyond the canvas width/height.
 // TODO(jzern): this is insufficient in the fragmented image case if the
 // expectation is that the fragments completely cover the canvas.
 static int CheckFrameBounds(const Frame* const frame, int exact,
                             int canvas_width, int canvas_height) {
  if (exact) {
    return frame->x_offset_ == 0 && frame->y_offset_ == 0 &&
           frame->width_ == canvas_width && frame->height_ == canvas_height;
  }
  // Short-circuit evaluation: the sums are only computed once both offsets
  // are known to be non-negative.
  return frame->x_offset_ >= 0 && frame->y_offset_ >= 0 &&
         frame->width_ + frame->x_offset_ <= canvas_width &&
         frame->height_ + frame->y_offset_ <= canvas_height;
 }
 static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
-  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
-  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
-  const Frame* f;
+  const Frame* f = dmux->frames_;
  if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
  if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
  if (dmux->loop_count_ < 0) return 0;
  if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
 #ifndef WEBP_EXPERIMENTAL_FEATURES
  if (is_fragmented) return 0;
 #endif
-  for (f = dmux->frames_; f != NULL; f = f->next_) {
+  while (f != NULL) {
    const int cur_frame_set = f->frame_num_;
    int frame_count = 0, fragment_count = 0;
@ -615,9 +649,10 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
      const ChunkData* const image = f->img_components_;
      const ChunkData* const alpha = f->img_components_ + 1;
-      if (!has_fragments && f->is_fragment_) return 0;
+      if (is_fragmented && !f->is_fragment_) return 0;
-      if (!has_frames && f->frame_num_ > 1) return 0;
+      if (!is_fragmented && f->is_fragment_) return 0;
-      if (f->x_offset_ < 0 || f->y_offset_ < 0) return 0;
+      if (!is_animation && f->frame_num_ > 1) return 0;
      if (f->complete_) {
        if (alpha->size_ == 0 && image->size_ == 0) return 0;
        // Ensure alpha precedes image bitstream.
@ -639,12 +674,17 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
        if (f->next_ != NULL) return 0;
      }
      if (f->width_ > 0 && f->height_ > 0 &&
          !CheckFrameBounds(f, !(is_animation || is_fragmented),
                            dmux->canvas_width_, dmux->canvas_height_)) {
        return 0;
      }
      fragment_count += f->is_fragment_;
      ++frame_count;
    }
-    if (!has_fragments && frame_count > 1) return 0;
+    if (!is_fragmented && frame_count > 1) return 0;
    if (fragment_count > 0 && frame_count != fragment_count) return 0;
    if (f == NULL) break;
  }
  return 1;
 }
@ -659,6 +699,7 @@ static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
  dmux->canvas_width_ = -1;
  dmux->canvas_height_ = -1;
  dmux->frames_tail_ = &dmux->frames_;
  dmux->chunks_tail_ = &dmux->chunks_;
  dmux->mem_ = *mem;
 }
@ -670,29 +711,40 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
  MemBuffer mem;
  WebPDemuxer* dmux;
  if (state != NULL) *state = WEBP_DEMUX_PARSE_ERROR;
  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
  if (data == NULL || data->bytes == NULL || data->size == 0) return NULL;
  if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
-  if (!ReadHeader(&mem)) return NULL;
+  status = ReadHeader(&mem);
  if (status != PARSE_OK) {
    if (state != NULL) {
      *state = (status == PARSE_NEED_MORE_DATA) ? WEBP_DEMUX_PARSING_HEADER
                                                : WEBP_DEMUX_PARSE_ERROR;
    }
    return NULL;
  }
  partial = (mem.buf_size_ < mem.riff_end_);
  if (!allow_partial && partial) return NULL;
-  dmux = (WebPDemuxer*)calloc(1, sizeof(*dmux));
+  dmux = (WebPDemuxer*)WebPSafeCalloc(1ULL, sizeof(*dmux));
  if (dmux == NULL) return NULL;
  InitDemux(dmux, &mem);
  status = PARSE_ERROR;
  for (parser = kMasterChunks; parser->parse != NULL; ++parser) {
    if (!memcmp(parser->id, GetBuffer(&dmux->mem_), TAG_SIZE)) {
      status = parser->parse(dmux);
      if (status == PARSE_OK) dmux->state_ = WEBP_DEMUX_DONE;
      if (status == PARSE_NEED_MORE_DATA && !partial) status = PARSE_ERROR;
      if (status != PARSE_ERROR && !parser->valid(dmux)) status = PARSE_ERROR;
      if (status == PARSE_ERROR) dmux->state_ = WEBP_DEMUX_PARSE_ERROR;
      break;
    }
  }
-  if (state) *state = dmux->state_;
+  if (state != NULL) *state = dmux->state_;
  if (status == PARSE_ERROR) {
    WebPDemuxDelete(dmux);
@ -709,14 +761,14 @@ void WebPDemuxDelete(WebPDemuxer* dmux) {
  for (f = dmux->frames_; f != NULL;) {
    Frame* const cur_frame = f;
    f = f->next_;
-    free(cur_frame);
+    WebPSafeFree(cur_frame);
  }
  for (c = dmux->chunks_; c != NULL;) {
    Chunk* const cur_chunk = c;
    c = c->next_;
-    free(cur_chunk);
+    WebPSafeFree(cur_chunk);
  }
-  free(dmux);
+  WebPSafeFree(dmux);
 }
 // -----------------------------------------------------------------------------
@ -809,8 +861,10 @@ static int SynthesizeFrame(const WebPDemuxer* const dmux,
  iter->y_offset       = fragment->y_offset_;
  iter->width          = fragment->width_;
  iter->height         = fragment->height_;
  iter->has_alpha      = fragment->has_alpha_;
  iter->duration       = fragment->duration_;
  iter->dispose_method = fragment->dispose_method_;
  iter->blend_method   = fragment->blend_method_;
  iter->complete       = fragment->complete_;
  iter->fragment.bytes = payload;
  iter->fragment.size  = payload_size;
@ -946,6 +1000,3 @@ void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter) {
  (void)iter;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }  // extern "C"
 #endif
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -1,5 +1,5 @@
-AM_CPPFLAGS = -I$(top_srcdir)/src
+noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
-noinst_LTLIBRARIES = libwebpdsp.la
+noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
 if BUILD_LIBWEBPDECODER
  noinst_LTLIBRARIES += libwebpdspdecode.la
@ -9,23 +9,49 @@ common_HEADERS = ../webp/types.h
 commondir = $(includedir)/webp
 COMMON_SOURCES =
 COMMON_SOURCES += alpha_processing.c
 COMMON_SOURCES += cpu.c
 COMMON_SOURCES += dec.c
 COMMON_SOURCES += dec_clip_tables.c
 COMMON_SOURCES += dec_mips32.c
 COMMON_SOURCES += dec_neon.c
 COMMON_SOURCES += dec_sse2.c
 COMMON_SOURCES += dsp.h
 COMMON_SOURCES += lossless.c
 COMMON_SOURCES += lossless.h
 COMMON_SOURCES += lossless_mips32.c
 COMMON_SOURCES += lossless_neon.c
 COMMON_SOURCES += neon.h
 COMMON_SOURCES += upsampling.c
 COMMON_SOURCES += upsampling_neon.c
 COMMON_SOURCES += upsampling_sse2.c
 COMMON_SOURCES += yuv.c
 COMMON_SOURCES += yuv.h
 COMMON_SOURCES += yuv_mips32.c
 ENC_SOURCES =
 ENC_SOURCES += enc.c
 ENC_SOURCES += enc_mips32.c
 ENC_SOURCES += enc_neon.c
-ENC_SOURCES += enc_sse2.c
+
 libwebpdsp_avx2_la_SOURCES =
 libwebpdsp_avx2_la_SOURCES += enc_avx2.c
 libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)
 libwebpdspdecode_sse2_la_SOURCES =
 libwebpdspdecode_sse2_la_SOURCES += alpha_processing_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += dec_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += lossless_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += upsampling_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += yuv_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += yuv_tables_sse2.h
 libwebpdspdecode_sse2_la_CPPFLAGS = $(libwebpdsp_sse2_la_CPPFLAGS)
 libwebpdspdecode_sse2_la_CFLAGS = $(libwebpdsp_sse2_la_CFLAGS)
 libwebpdsp_sse2_la_SOURCES =
 libwebpdsp_sse2_la_SOURCES += enc_sse2.c
 libwebpdsp_sse2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
 libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la
 libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES)
@ -33,12 +59,14 @@ noinst_HEADERS =
 noinst_HEADERS += ../dec/decode_vp8.h
 noinst_HEADERS += ../webp/decode.h
 libwebpdsp_la_LDFLAGS = -lm
 libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
 libwebpdsp_la_LDFLAGS = -lm
 libwebpdsp_la_LIBADD = libwebpdsp_avx2.la libwebpdsp_sse2.la
 if BUILD_LIBWEBPDECODER
  libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)
  libwebpdspdecode_la_LDFLAGS = $(libwebpdsp_la_LDFLAGS)
  libwebpdspdecode_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
  libwebpdspdecode_la_LDFLAGS = $(libwebpdsp_la_LDFLAGS)
  libwebpdspdecode_la_LIBADD = libwebpdspdecode_sse2.la
 endif
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@ -0,0 +1,335 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Utilities for processing transparent channel.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <assert.h>
 #include "./dsp.h"
 // Tables can be faster on some platforms but incur some extra binary size (~2k).
 // #define USE_TABLES_FOR_ALPHA_MULT
 // -----------------------------------------------------------------------------
 #define MFIX 24    // 24bit fixed-point arithmetic
 #define HALF ((1u << MFIX) >> 1)
 #define KINV_255 ((1u << MFIX) / 255u)
 // Multiplies the 8-bit sample 'x' by the MFIX-bit fixed-point factor 'mult'
 // and returns the product rounded to the nearest integer.
 static uint32_t Mult(uint8_t x, uint32_t mult) {
  // Adding HALF before dropping the fractional bits rounds to nearest.
  const uint32_t biased = x * mult + HALF;
  const uint32_t result = biased >> MFIX;
  // 24bit precision is enough to keep the result within 8 bits.
  assert(result <= 255);
  return result;
 }
 #ifdef USE_TABLES_FOR_ALPHA_MULT
 // Precomputed MFIX-bit fixed-point scale factors, indexed as [row][alpha].
 // Row 0 holds the inverse factors ((255u << MFIX) / alpha), row 1 the forward
 // factors (alpha * KINV_255). Entry 0 of both rows is zero.
 static const uint32_t kMultTables[2][256] = {
  {    // (255u << MFIX) / alpha
    0x00000000, 0xff000000, 0x7f800000, 0x55000000, 0x3fc00000, 0x33000000,
    0x2a800000, 0x246db6db, 0x1fe00000, 0x1c555555, 0x19800000, 0x172e8ba2,
    0x15400000, 0x139d89d8, 0x1236db6d, 0x11000000, 0x0ff00000, 0x0f000000,
    0x0e2aaaaa, 0x0d6bca1a, 0x0cc00000, 0x0c249249, 0x0b9745d1, 0x0b1642c8,
    0x0aa00000, 0x0a333333, 0x09cec4ec, 0x0971c71c, 0x091b6db6, 0x08cb08d3,
    0x08800000, 0x0839ce73, 0x07f80000, 0x07ba2e8b, 0x07800000, 0x07492492,
    0x07155555, 0x06e45306, 0x06b5e50d, 0x0689d89d, 0x06600000, 0x063831f3,
    0x06124924, 0x05ee23b8, 0x05cba2e8, 0x05aaaaaa, 0x058b2164, 0x056cefa8,
    0x05500000, 0x05343eb1, 0x05199999, 0x05000000, 0x04e76276, 0x04cfb2b7,
    0x04b8e38e, 0x04a2e8ba, 0x048db6db, 0x0479435e, 0x04658469, 0x045270d0,
    0x04400000, 0x042e29f7, 0x041ce739, 0x040c30c3, 0x03fc0000, 0x03ec4ec4,
    0x03dd1745, 0x03ce540f, 0x03c00000, 0x03b21642, 0x03a49249, 0x03976fc6,
    0x038aaaaa, 0x037e3f1f, 0x03722983, 0x03666666, 0x035af286, 0x034fcace,
    0x0344ec4e, 0x033a5440, 0x03300000, 0x0325ed09, 0x031c18f9, 0x0312818a,
    0x03092492, 0x03000000, 0x02f711dc, 0x02ee5846, 0x02e5d174, 0x02dd7baf,
    0x02d55555, 0x02cd5cd5, 0x02c590b2, 0x02bdef7b, 0x02b677d4, 0x02af286b,
    0x02a80000, 0x02a0fd5c, 0x029a1f58, 0x029364d9, 0x028ccccc, 0x0286562d,
    0x02800000, 0x0279c952, 0x0273b13b, 0x026db6db, 0x0267d95b, 0x026217ec,
    0x025c71c7, 0x0256e62a, 0x0251745d, 0x024c1bac, 0x0246db6d, 0x0241b2f9,
    0x023ca1af, 0x0237a6f4, 0x0232c234, 0x022df2df, 0x02293868, 0x02249249,
    0x02200000, 0x021b810e, 0x021714fb, 0x0212bb51, 0x020e739c, 0x020a3d70,
    0x02061861, 0x02020408, 0x01fe0000, 0x01fa0be8, 0x01f62762, 0x01f25213,
    0x01ee8ba2, 0x01ead3ba, 0x01e72a07, 0x01e38e38, 0x01e00000, 0x01dc7f10,
    0x01d90b21, 0x01d5a3e9, 0x01d24924, 0x01cefa8d, 0x01cbb7e3, 0x01c880e5,
    0x01c55555, 0x01c234f7, 0x01bf1f8f, 0x01bc14e5, 0x01b914c1, 0x01b61eed,
    0x01b33333, 0x01b05160, 0x01ad7943, 0x01aaaaaa, 0x01a7e567, 0x01a5294a,
    0x01a27627, 0x019fcbd2, 0x019d2a20, 0x019a90e7, 0x01980000, 0x01957741,
    0x0192f684, 0x01907da4, 0x018e0c7c, 0x018ba2e8, 0x018940c5, 0x0186e5f0,
    0x01849249, 0x018245ae, 0x01800000, 0x017dc11f, 0x017b88ee, 0x0179574e,
    0x01772c23, 0x01750750, 0x0172e8ba, 0x0170d045, 0x016ebdd7, 0x016cb157,
    0x016aaaaa, 0x0168a9b9, 0x0166ae6a, 0x0164b8a7, 0x0162c859, 0x0160dd67,
    0x015ef7bd, 0x015d1745, 0x015b3bea, 0x01596596, 0x01579435, 0x0155c7b4,
    0x01540000, 0x01523d03, 0x01507eae, 0x014ec4ec, 0x014d0fac, 0x014b5edc,
    0x0149b26c, 0x01480a4a, 0x01466666, 0x0144c6af, 0x01432b16, 0x0141938b,
    0x01400000, 0x013e7063, 0x013ce4a9, 0x013b5cc0, 0x0139d89d, 0x01385830,
    0x0136db6d, 0x01356246, 0x0133ecad, 0x01327a97, 0x01310bf6, 0x012fa0be,
    0x012e38e3, 0x012cd459, 0x012b7315, 0x012a150a, 0x0128ba2e, 0x01276276,
    0x01260dd6, 0x0124bc44, 0x01236db6, 0x01222222, 0x0120d97c, 0x011f93bc,
    0x011e50d7, 0x011d10c4, 0x011bd37a, 0x011a98ef, 0x0119611a, 0x01182bf2,
    0x0116f96f, 0x0115c988, 0x01149c34, 0x0113716a, 0x01124924, 0x01112358,
    0x01100000, 0x010edf12, 0x010dc087, 0x010ca458, 0x010b8a7d, 0x010a72f0,
    0x01095da8, 0x01084a9f, 0x010739ce, 0x01062b2e, 0x01051eb8, 0x01041465,
    0x01030c30, 0x01020612, 0x01010204, 0x01000000 },
  {   // alpha * KINV_255
    0x00000000, 0x00010101, 0x00020202, 0x00030303, 0x00040404, 0x00050505,
    0x00060606, 0x00070707, 0x00080808, 0x00090909, 0x000a0a0a, 0x000b0b0b,
    0x000c0c0c, 0x000d0d0d, 0x000e0e0e, 0x000f0f0f, 0x00101010, 0x00111111,
    0x00121212, 0x00131313, 0x00141414, 0x00151515, 0x00161616, 0x00171717,
    0x00181818, 0x00191919, 0x001a1a1a, 0x001b1b1b, 0x001c1c1c, 0x001d1d1d,
    0x001e1e1e, 0x001f1f1f, 0x00202020, 0x00212121, 0x00222222, 0x00232323,
    0x00242424, 0x00252525, 0x00262626, 0x00272727, 0x00282828, 0x00292929,
    0x002a2a2a, 0x002b2b2b, 0x002c2c2c, 0x002d2d2d, 0x002e2e2e, 0x002f2f2f,
    0x00303030, 0x00313131, 0x00323232, 0x00333333, 0x00343434, 0x00353535,
    0x00363636, 0x00373737, 0x00383838, 0x00393939, 0x003a3a3a, 0x003b3b3b,
    0x003c3c3c, 0x003d3d3d, 0x003e3e3e, 0x003f3f3f, 0x00404040, 0x00414141,
    0x00424242, 0x00434343, 0x00444444, 0x00454545, 0x00464646, 0x00474747,
    0x00484848, 0x00494949, 0x004a4a4a, 0x004b4b4b, 0x004c4c4c, 0x004d4d4d,
    0x004e4e4e, 0x004f4f4f, 0x00505050, 0x00515151, 0x00525252, 0x00535353,
    0x00545454, 0x00555555, 0x00565656, 0x00575757, 0x00585858, 0x00595959,
    0x005a5a5a, 0x005b5b5b, 0x005c5c5c, 0x005d5d5d, 0x005e5e5e, 0x005f5f5f,
    0x00606060, 0x00616161, 0x00626262, 0x00636363, 0x00646464, 0x00656565,
    0x00666666, 0x00676767, 0x00686868, 0x00696969, 0x006a6a6a, 0x006b6b6b,
    0x006c6c6c, 0x006d6d6d, 0x006e6e6e, 0x006f6f6f, 0x00707070, 0x00717171,
    0x00727272, 0x00737373, 0x00747474, 0x00757575, 0x00767676, 0x00777777,
    0x00787878, 0x00797979, 0x007a7a7a, 0x007b7b7b, 0x007c7c7c, 0x007d7d7d,
    0x007e7e7e, 0x007f7f7f, 0x00808080, 0x00818181, 0x00828282, 0x00838383,
    0x00848484, 0x00858585, 0x00868686, 0x00878787, 0x00888888, 0x00898989,
    0x008a8a8a, 0x008b8b8b, 0x008c8c8c, 0x008d8d8d, 0x008e8e8e, 0x008f8f8f,
    0x00909090, 0x00919191, 0x00929292, 0x00939393, 0x00949494, 0x00959595,
    0x00969696, 0x00979797, 0x00989898, 0x00999999, 0x009a9a9a, 0x009b9b9b,
    0x009c9c9c, 0x009d9d9d, 0x009e9e9e, 0x009f9f9f, 0x00a0a0a0, 0x00a1a1a1,
    0x00a2a2a2, 0x00a3a3a3, 0x00a4a4a4, 0x00a5a5a5, 0x00a6a6a6, 0x00a7a7a7,
    0x00a8a8a8, 0x00a9a9a9, 0x00aaaaaa, 0x00ababab, 0x00acacac, 0x00adadad,
    0x00aeaeae, 0x00afafaf, 0x00b0b0b0, 0x00b1b1b1, 0x00b2b2b2, 0x00b3b3b3,
    0x00b4b4b4, 0x00b5b5b5, 0x00b6b6b6, 0x00b7b7b7, 0x00b8b8b8, 0x00b9b9b9,
    0x00bababa, 0x00bbbbbb, 0x00bcbcbc, 0x00bdbdbd, 0x00bebebe, 0x00bfbfbf,
    0x00c0c0c0, 0x00c1c1c1, 0x00c2c2c2, 0x00c3c3c3, 0x00c4c4c4, 0x00c5c5c5,
    0x00c6c6c6, 0x00c7c7c7, 0x00c8c8c8, 0x00c9c9c9, 0x00cacaca, 0x00cbcbcb,
    0x00cccccc, 0x00cdcdcd, 0x00cecece, 0x00cfcfcf, 0x00d0d0d0, 0x00d1d1d1,
    0x00d2d2d2, 0x00d3d3d3, 0x00d4d4d4, 0x00d5d5d5, 0x00d6d6d6, 0x00d7d7d7,
    0x00d8d8d8, 0x00d9d9d9, 0x00dadada, 0x00dbdbdb, 0x00dcdcdc, 0x00dddddd,
    0x00dedede, 0x00dfdfdf, 0x00e0e0e0, 0x00e1e1e1, 0x00e2e2e2, 0x00e3e3e3,
    0x00e4e4e4, 0x00e5e5e5, 0x00e6e6e6, 0x00e7e7e7, 0x00e8e8e8, 0x00e9e9e9,
    0x00eaeaea, 0x00ebebeb, 0x00ececec, 0x00ededed, 0x00eeeeee, 0x00efefef,
    0x00f0f0f0, 0x00f1f1f1, 0x00f2f2f2, 0x00f3f3f3, 0x00f4f4f4, 0x00f5f5f5,
    0x00f6f6f6, 0x00f7f7f7, 0x00f8f8f8, 0x00f9f9f9, 0x00fafafa, 0x00fbfbfb,
    0x00fcfcfc, 0x00fdfdfd, 0x00fefefe, 0x00ffffff }
 };
 // Table-driven lookup of the fixed-point scale for alpha value 'a'.
 // A non-zero 'inverse' selects row 0 of kMultTables, zero selects row 1.
 static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
  const int row = inverse ? 0 : 1;
  return kMultTables[row][a];
 }
 #else
 // Computes the MFIX-bit fixed-point scale for alpha value 'a':
 // 'a * KINV_255' when 'inverse' is zero, '(255u << MFIX) / a' otherwise.
 // A zero alpha yields 0 in the inverse direction too, which matches entry 0
 // of the kMultTables variant and avoids a division by zero.
 static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
  if (!inverse) return a * KINV_255;
  return (a != 0) ? (255u << MFIX) / a : 0;
 }
 #endif    // USE_TABLES_FOR_ALPHA_MULT
 // Scales, in place, the three low bytes of each of the 'width' 32-bit pixels
 // in 'ptr' by a factor derived from the pixel's top byte (its alpha).
 // Pixels whose alpha is 255 are left untouched; pixels whose alpha is 0 are
 // cleared entirely. 'inverse' is forwarded to GetScale() to pick the
 // direction of the scaling.
 static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint32_t pixel = ptr[i];
    const uint32_t a = pixel >> 24;
    uint32_t scale, lo, mid, hi;
    if (a == 255) continue;   // fully opaque: nothing to do
    if (a == 0) {             // fully transparent: wipe the whole pixel
      ptr[i] = 0;
      continue;
    }
    scale = GetScale(a, inverse);
    // Mult() takes an 8-bit sample, so only the lowest byte of each shifted
    // value is used.
    lo  = Mult((uint8_t)(pixel >>  0), scale);
    mid = Mult((uint8_t)(pixel >>  8), scale);
    hi  = Mult((uint8_t)(pixel >> 16), scale);
    ptr[i] = (pixel & 0xff000000u) | (lo << 0) | (mid << 8) | (hi << 16);
  }
 }
 // Scales, in place, each of the 'width' samples in 'ptr' by a factor derived
 // from the matching entry of 'alpha'. Samples with alpha 255 are left as is,
 // samples with alpha 0 are set to 0. 'inverse' is forwarded to GetScale().
 static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
                     int width, int inverse) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint32_t a = alpha[i];
    if (a == 255) continue;
    if (a == 0) {
      ptr[i] = 0;
      continue;
    }
    ptr[i] = Mult(ptr[i], GetScale(a, inverse));
  }
 }
 #undef KINV_255
 #undef HALF
 #undef MFIX
 // Row-level entry points, reached through function pointers.
 // WebPInitAlphaProcessing() assigns the plain-C versions (MultARGBRow and
 // MultRow); the pointers have no initializer, so they must not be called
 // before that function has run.
 void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
 void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
                     int width, int inverse);
 //------------------------------------------------------------------------------
 // Generic per-plane calls
 // Applies WebPMultARGBRow() to 'num_rows' consecutive rows of 'width' 32-bit
 // pixels, starting at 'ptr' and stepping 'stride' bytes between rows.
 void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
                       int inverse) {
  uint8_t* row = ptr;
  int rows_left;
  for (rows_left = num_rows; rows_left > 0; --rows_left) {
    WebPMultARGBRow((uint32_t*)row, width, inverse);
    row += stride;
  }
 }
 // Applies WebPMultRow() to 'num_rows' consecutive rows of 'width' samples.
 // The sample plane starts at 'ptr' and advances by 'stride' bytes per row;
 // the alpha plane starts at 'alpha' and advances by 'alpha_stride' bytes.
 void WebPMultRows(uint8_t* ptr, int stride,
                   const uint8_t* alpha, int alpha_stride,
                   int width, int num_rows, int inverse) {
  uint8_t* row = ptr;
  const uint8_t* alpha_row = alpha;
  int rows_left;
  for (rows_left = num_rows; rows_left > 0; --rows_left) {
    WebPMultRow(row, alpha_row, width, inverse);
    row += stride;
    alpha_row += alpha_stride;
  }
 }
 //------------------------------------------------------------------------------
 // Premultiplied modes
 // non dithered-modes
 // (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
 // for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
 // one can use instead: (x * a * 65793 + (1 << 23)) >> 24
 #if 1     // (int)(x * a / 255.)
 #define MULTIPLIER(a)   ((a) * 32897U)
 #define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
 #else     // (int)(x * a / 255. + .5)
 #define MULTIPLIER(a) ((a) * 65793U)
 #define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
 #endif
 // Premultiplies, in place, the three colour bytes of every 4-byte pixel by
 // the pixel's alpha byte. The picture is 'w' pixels wide and 'h' rows high,
 // with rows 'stride' bytes apart. When 'alpha_first' is non-zero the alpha
 // is byte 0 of each pixel and the colours are bytes 1..3; otherwise the
 // colours are bytes 0..2 and the alpha is byte 3. Pixels whose alpha is
 // 0xff are skipped.
 static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
                                int w, int h, int stride) {
  const int alpha_offset = alpha_first ? 0 : 3;
  const int color_offset = alpha_first ? 1 : 0;
  int y;
  for (y = 0; y < h; ++y) {
    int x;
    for (x = 0; x < w; ++x) {
      uint8_t* const color = rgba + 4 * x + color_offset;
      const uint32_t a = rgba[4 * x + alpha_offset];
      if (a != 0xff) {
        const uint32_t mult = MULTIPLIER(a);
        int c;
        for (c = 0; c < 3; ++c) {
          color[c] = PREMULTIPLY(color[c], mult);
        }
      }
    }
    rgba += stride;
  }
 }
 #undef MULTIPLIER
 #undef PREMULTIPLY
 // rgbA4444
 #define MULTIPLIER(a)  ((a) * 0x1111)    // 0x1111 ~= (1 << 16) / 15
 // Expands the high nibble of 'x' to 8 bits by copying it into the low nibble.
 static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
  const uint8_t hi = x & 0xf0;
  return hi | (hi >> 4);
 }
 // Expands the low nibble of 'x' to 8 bits by copying it into the high nibble.
 static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
  const uint8_t lo = x & 0x0f;
  return (uint8_t)(lo | (lo << 4));
 }
 // Multiplies 'x' by the 16-bit fixed-point factor 'm' and returns the integer
 // part of the product, truncated to 8 bits.
 static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
  const uint32_t product = x * m;
  return (uint8_t)(product >> 16);
 }
 // Premultiplies, in place, a picture of 2-byte pixels holding four 4-bit
 // channels each. One byte carries two colour nibbles ("rg"), the other a
 // colour nibble in its high half and the alpha in its low half ("ba").
 // 'rg_byte_pos' (0 or 1) tells which of the two bytes is the "rg" one.
 // The picture is 'w' pixels by 'h' rows, rows being 'stride' bytes apart.
 static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
                                                int w, int h, int stride,
                                                int rg_byte_pos /* 0 or 1 */) {
  const int ba_byte_pos = rg_byte_pos ^ 1;
  int y;
  for (y = 0; y < h; ++y) {
    int x;
    for (x = 0; x < w; ++x) {
      uint8_t* const rg_ptr = &rgba4444[2 * x + rg_byte_pos];
      uint8_t* const ba_ptr = &rgba4444[2 * x + ba_byte_pos];
      const uint8_t rg = *rg_ptr;
      const uint8_t ba = *ba_ptr;
      const uint8_t a = ba & 0x0f;
      const uint32_t mult = MULTIPLIER(a);
      // Each nibble is widened to 8 bits, scaled, then narrowed again below.
      const uint8_t r = multiply(dither_hi(rg), mult);
      const uint8_t g = multiply(dither_lo(rg), mult);
      const uint8_t b = multiply(dither_hi(ba), mult);
      *rg_ptr = (r & 0xf0) | ((g >> 4) & 0x0f);
      *ba_ptr = (b & 0xf0) | a;   // the alpha nibble is kept unchanged
    }
    rgba4444 += stride;
  }
 }
 #undef MULTIPLIER
 // Fixed-signature wrapper around ApplyAlphaMultiply4444(): the position of
 // the "rg" byte is 1 when WEBP_SWAP_16BIT_CSP is defined and 0 otherwise.
 static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
                                    int w, int h, int stride) {
 #ifdef WEBP_SWAP_16BIT_CSP
  const int rg_byte_pos = 1;
 #else
  const int rg_byte_pos = 0;
 #endif
  ApplyAlphaMultiply4444(rgba4444, w, h, stride, rg_byte_pos);
 }
 // Copies the first byte of every 4-byte pixel of a 'width' x 'height' area of
 // 'argb' into the 'alpha' plane. Source rows are 'argb_stride' bytes apart,
 // destination rows 'alpha_stride' bytes apart.
 // Returns 1 if every copied value equals 0xff (also when the area is empty),
 // 0 otherwise.
 static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                         int width, int height,
                         uint8_t* alpha, int alpha_stride) {
  int all_opaque = 1;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const uint8_t value = argb[4 * x];
      alpha[x] = value;
      if (value != 0xff) all_opaque = 0;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  return all_opaque;
 }
 // Dispatch pointers for the premultiply and alpha-extraction routines.
 // WebPInitAlphaProcessing() points them at the C implementations of this
 // file (ApplyAlphaMultiply, ApplyAlphaMultiply_16b and ExtractAlpha); they
 // have no initializer, so they must not be called before it has run.
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 //------------------------------------------------------------------------------
 // Init function
 extern void WebPInitAlphaProcessingSSE2(void);
 // Remembers the VP8GetCPUInfo value seen by the last completed
 // initialization. It starts out as the address of this very variable,
 // presumably so that it cannot compare equal to any real VP8GetCPUInfo
 // value (NULL included) before the first initialization.
 static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
    (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
 // Installs the alpha-processing function pointers. The work is skipped when
 // VP8GetCPUInfo still has the value recorded by the previous call, and is
 // redone if VP8GetCPUInfo has changed in between.
 void WebPInitAlphaProcessing(void) {
  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
  // Start from the portable C implementations.
  WebPMultARGBRow = MultARGBRow;
  WebPMultRow = MultRow;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
  WebPExtractAlpha = ExtractAlpha;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitAlphaProcessingSSE2();
    }
 #endif
  }
  // Recorded last, once every pointer above has been assigned.
  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@ -0,0 +1,77 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Utilities for processing transparent channel.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./dsp.h"
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 //------------------------------------------------------------------------------
 // SSE2 version of ExtractAlpha: copies the first byte of every 4-byte pixel
 // of a 'width' x 'height' area of 'argb' into 'alpha'. Source rows are
 // 'argb_stride' bytes apart, destination rows 'alpha_stride' bytes apart.
 // Returns 1 if every copied value equals 0xff, 0 otherwise.
 static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                         int width, int height,
                         uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
  // Low 64 bits all ones, high 64 bits zero: only the low eight byte lanes
  // carry alpha values in the accumulator below.
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;
  // We must be able to access 3 extra bytes after the last written byte
  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  // Rounding 'width - 1' down to a multiple of 8 keeps the vector loop away
  // from the last pixel of each row; the scalar loop below handles the rest.
  const int limit = (width - 1) & ~7;
  for (j = 0; j < height; ++j) {
    const __m128i* src = (const __m128i*)argb;
    // Eight pixels (32 source bytes, 8 alpha bytes) per iteration.
    for (i = 0; i < limit; i += 8) {
      // load 32 argb bytes
      const __m128i a0 = _mm_loadu_si128(src + 0);
      const __m128i a1 = _mm_loadu_si128(src + 1);
      // keep only the lowest byte of each 32-bit pixel
      const __m128i b0 = _mm_and_si128(a0, a_mask);
      const __m128i b1 = _mm_and_si128(a1, a_mask);
      // Narrow 32 -> 16 -> 8 bits. The masked values are at most 0xff, so the
      // saturating packs leave them unchanged.
      const __m128i c0 = _mm_packs_epi32(b0, b1);
      const __m128i d0 = _mm_packus_epi16(c0, c0);
      // store
      _mm_storel_epi64((__m128i*)&alpha[i], d0);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, d0);
      src += 2;
    }
    // Scalar tail for the pixels the vector loop did not cover.
    for (; i < width; ++i) {
      const uint32_t alpha_value = argb[4 * i];
      alpha[i] = alpha_value;
      alpha_and &= alpha_value;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask.
  // 'alpha_and' never exceeds 0xff, so only the low eight bits of the
  // movemask result (one per accumulated byte lane) can survive the 'and'.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and == 0xff);
 }
 #endif   // WEBP_USE_SSE2
 //------------------------------------------------------------------------------
 // Init function
 extern void WebPInitAlphaProcessingSSE2(void);
 // Points WebPExtractAlpha at the SSE2 implementation of this file.
 // The function is defined even when WEBP_USE_SSE2 is not, in which case it
 // does nothing.
 void WebPInitAlphaProcessingSSE2(void) {
 #if defined(WEBP_USE_SSE2)
  WebPExtractAlpha = ExtractAlpha;
 #endif
 }
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // CPU detection
@ -15,10 +17,6 @@
 #include <cpu-features.h>
 #endif
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // SSE2 detection.
 //
@ -31,19 +29,54 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
    "cpuid\n"
    "xchg %%edi, %%ebx\n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
+    : "a"(info_type), "c"(0));
 }
 #elif defined(__i386__) || defined(__x86_64__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "cpuid\n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
+    : "a"(info_type), "c"(0));
 }
 #elif (defined(_M_X64) || defined(_M_IX86)) && \
      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
 #include <intrin.h>
 #define GetCPUInfo(info, type) __cpuidex(info, type, 0)  // set ecx=0
 #elif defined(WEBP_MSC_SSE2)
 #define GetCPUInfo __cpuid
 #endif
 // NaCl has no support for xgetbv or the raw opcode.
 #if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
 // Executes the xgetbv instruction with ecx = 0 and returns its result as a
 // 64-bit value: edx in the upper 32 bits, eax in the lower 32 bits.
 static WEBP_INLINE uint64_t xgetbv(void) {
  const uint32_t ecx = 0;
  uint32_t eax, edx;
  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  __asm__ volatile (
    ".byte 0x0f, 0x01, 0xd0\n"
    : "=a"(eax), "=d"(edx) : "c" (ecx));
  return ((uint64_t)edx << 32) | eax;
 }
 #elif (defined(_M_X64) || defined(_M_IX86)) && \
      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
 #include <immintrin.h>
 #define xgetbv() _xgetbv(0)
 #elif defined(_MSC_VER) && defined(_M_IX86)
 // MSVC inline-assembly variant: executes xgetbv with ecx = 0 and returns its
 // result as a 64-bit value, edx in the upper 32 bits and eax in the lower 32.
 static WEBP_INLINE uint64_t xgetbv(void) {
  uint32_t eax_, edx_;
  __asm {
    xor ecx, ecx  // ecx = 0
    // Use the raw opcode for xgetbv for compatibility with older toolchains.
    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
    mov eax_, eax
    mov edx_, edx
  }
  return ((uint64_t)edx_ << 32) | eax_;
 }
 #else
 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
 #endif
 #if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
 static int x86CPUInfo(CPUFeature feature) {
  int cpu_info[4];
@ -54,10 +87,23 @@ static int x86CPUInfo(CPUFeature feature) {
  if (feature == kSSE3) {
    return 0 != (cpu_info[2] & 0x00000001);
  }
  if (feature == kAVX) {
    // bits 27 (OSXSAVE) & 28 (256-bit AVX)
    if ((cpu_info[2] & 0x18000000) == 0x18000000) {
      // XMM state and YMM state enabled by the OS.
      return (xgetbv() & 0x6) == 0x6;
    }
  }
  if (feature == kAVX2) {
    if (x86CPUInfo(kAVX)) {
      GetCPUInfo(cpu_info, 7);
      return ((cpu_info[1] & 0x00000020) == 0x00000020);
    }
  }
  return 0;
 }
 VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
-#elif defined(WEBP_ANDROID_NEON)
+#elif defined(WEBP_ANDROID_NEON)  // NB: needs to be before generic NEON test.
 static int AndroidCPUInfo(CPUFeature feature) {
  const AndroidCpuFamily cpu_family = android_getCpuFamily();
  const uint64_t cpu_features = android_getCpuFeatures();
@ -68,7 +114,7 @@ static int AndroidCPUInfo(CPUFeature feature) {
  return 0;
 }
 VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
-#elif defined(__ARM_NEON__)
+#elif defined(WEBP_USE_NEON)
 // define a dummy function to enable turning off NEON at runtime by setting
 // VP8DecGetCPUInfo = NULL
 static int armCPUInfo(CPUFeature feature) {
@ -76,10 +122,13 @@ static int armCPUInfo(CPUFeature feature) {
  return 1;
 }
 VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
 #elif defined(WEBP_USE_MIPS32)
 // CPU-info callback used when WEBP_USE_MIPS32 is defined: ignores 'feature'
 // and reports every queried feature as available.
 static int mipsCPUInfo(CPUFeature feature) {
  (void)feature;
  return 1;
 }
 VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
 #else
 VP8CPUInfo VP8GetCPUInfo = NULL;
 #endif
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Speed-critical decoding functions.
@ -12,42 +14,7 @@
 #include "./dsp.h"
 #include "../dec/vp8i.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // run-time tables (~4k)
 static uint8_t abs0[255 + 255 + 1];     // abs(i)
 static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
 static int8_t sclip1[1020 + 1020 + 1];  // clips [-1020, 1020] to [-128, 127]
 static int8_t sclip2[112 + 112 + 1];    // clips [-112, 112] to [-16, 15]
 static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
 // We declare this variable 'volatile' to prevent instruction reordering
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;
 // Fills the run-time lookup tables (abs0, abs1, sclip1, sclip2, clip1) unless
 // 'tables_ok' is already set. Each table is indexed with a bias equal to the
 // magnitude of its lowest input, e.g. abs0[255 + i] for i in [-255, 255].
 static void DspInitTables(void) {
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255; ++i) {
      abs0[255 + i] = (i < 0) ? -i : i;
      abs1[255 + i] = abs0[255 + i] >> 1;
    }
    for (i = -1020; i <= 1020; ++i) {
      sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
    }
    for (i = -112; i <= 112; ++i) {
      sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
    }
    for (i = -255; i <= 255 + 255; ++i) {
      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
    }
    // Set only after every table has been written.
    tables_ok = 1;
  }
 }
 static WEBP_INLINE uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
@ -59,6 +26,14 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define STORE(x, y, v) \
  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
 #define STORE2(y, dc, d, c) do {    \
  const int DC = (dc);              \
  STORE(0, y, DC + (d));            \
  STORE(1, y, DC + (c));            \
  STORE(2, y, DC - (c));            \
  STORE(3, y, DC - (d));            \
 } while (0)
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)
@ -101,7 +76,21 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    dst += BPS;
  }
 }
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
 // Reads just those three coefficients from 'in' and updates the 4x4 block at
 // 'dst' (rows BPS bytes apart) through STORE2, which adds the scaled value
 // to the existing pixel and clips the sum to 8 bits.
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  // The '+ 4' is the rounding term for the '>> 3' applied inside STORE.
  const int a = in[0] + 4;
  // in[4] contributes a per-row offset (folded into the 'dc' argument)...
  const int c4 = MUL(in[4], kC2);
  const int d4 = MUL(in[4], kC1);
  // ...while in[1] contributes the same per-column offsets to every row.
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  STORE2(0, a + d4, d1, c1);
  STORE2(1, a + c4, d1, c1);
  STORE2(2, a - c4, d1, c1);
  STORE2(3, a - d4, d1, c1);
 }
 #undef MUL
 #undef STORE2
 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
@ -126,10 +115,10 @@ static void TransformDC(const int16_t *in, uint8_t* dst) {
 }
 static void TransformDCUV(const int16_t* in, uint8_t* dst) {
-  if (in[0 * 16]) TransformDC(in + 0 * 16, dst);
+  if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
-  if (in[1 * 16]) TransformDC(in + 1 * 16, dst + 4);
+  if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
-  if (in[2 * 16]) TransformDC(in + 2 * 16, dst + 4 * BPS);
+  if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
-  if (in[3 * 16]) TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
+  if (in[3 * 16]) VP8TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
 }
 #undef STORE
@ -164,7 +153,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
  }
 }
-void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
+void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 //------------------------------------------------------------------------------
 // Intra predictions
@ -173,7 +162,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
 static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
  const uint8_t* top = dst - BPS;
-  const uint8_t* const clip0 = clip1 + 255 - top[-1];
+  const uint8_t* const clip0 = VP8kclip1 - top[-1];
  int y;
  for (y = 0; y < size; ++y) {
    const uint8_t* const clip = clip0 + dst[-1];
@ -428,14 +417,9 @@ static void HE8uv(uint8_t *dst) {    // horizontal
 // helper for chroma-DC predictions
 static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  int j;
 #ifndef WEBP_REFERENCE_IMPLEMENTATION
  const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
  for (j = 0; j < 8; ++j) {
-    *(uint64_t*)(dst + j * BPS) = v;
+    memset(dst + j * BPS, value, 8);
  }
 #else
  for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
 #endif
 }
 static void DC8uv(uint8_t *dst) {     // DC
@ -492,61 +476,62 @@ const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
 // 4 pixels in, 2 pixels out
 // Reads p[-2*step], p[-step], p[0] and p[step], and rewrites only p[-step]
 // and p[0]. All clamping is done through lookup tables. The '+' lines index
 // the VP8k* pointers directly with the signed value, so the explicit
 // 1020 / 112 / 255 biases of the old file-local tables disappear.
 static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
+  const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
-  const int a1 = sclip2[112 + ((a + 4) >> 3)];
+  const int a1 = VP8ksclip2[(a + 4) >> 3];            // in [-16,15]
-  const int a2 = sclip2[112 + ((a + 3) >> 3)];
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
-  p[-step] = clip1[255 + p0 + a2];
+  p[-step] = VP8kclip1[p0 + a2];
-  p[    0] = clip1[255 + q0 - a1];
+  p[    0] = VP8kclip1[q0 - a1];
 }
 // 4 pixels in, 4 pixels out
 // Reads p[-2*step] .. p[step] and rewrites all four. The correction is
 // derived from q0 - p0 only: a1 / a2 adjust the inner pair and a3, half of
 // a1 rounded up, adjusts the outer pair. The '+' lines switch the table
 // lookups to the VP8k* pointers, which take the signed index directly.
 static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0);
-  const int a1 = sclip2[112 + ((a + 4) >> 3)];
+  const int a1 = VP8ksclip2[(a + 4) >> 3];
-  const int a2 = sclip2[112 + ((a + 3) >> 3)];
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
  const int a3 = (a1 + 1) >> 1;
-  p[-2*step] = clip1[255 + p1 + a3];
+  p[-2*step] = VP8kclip1[p1 + a3];
-  p[-  step] = clip1[255 + p0 + a2];
+  p[-  step] = VP8kclip1[p0 + a2];
-  p[      0] = clip1[255 + q0 - a1];
+  p[      0] = VP8kclip1[q0 - a1];
-  p[   step] = clip1[255 + q1 - a3];
+  p[   step] = VP8kclip1[q1 - a3];
 }
 // 6 pixels in, 6 pixels out
 // Reads three pixels on each side of the edge at p (spaced 'step' apart) and
 // rewrites all six. One clamped value 'a' is scaled by 27/128, 18/128 and
 // 9/128 (with rounding) to give the corrections for the inner, middle and
 // outer pixel pairs. The '+' lines switch the table lookups to the VP8k*
 // pointers, which take the signed index directly.
 static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
  const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2*step];
-  const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
+  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
  // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
-  p[-3*step] = clip1[255 + p2 + a3];
+  p[-3*step] = VP8kclip1[p2 + a3];
-  p[-2*step] = clip1[255 + p1 + a2];
+  p[-2*step] = VP8kclip1[p1 + a2];
-  p[-  step] = clip1[255 + p0 + a1];
+  p[-  step] = VP8kclip1[p0 + a1];
-  p[      0] = clip1[255 + q0 - a1];
+  p[      0] = VP8kclip1[q0 - a1];
-  p[   step] = clip1[255 + q1 - a2];
+  p[   step] = VP8kclip1[q1 - a2];
-  p[ 2*step] = clip1[255 + q2 - a3];
+  p[ 2*step] = VP8kclip1[q2 - a3];
 }
 // Returns non-zero when |p1 - p0| or |q1 - q0| is greater than 'thresh'.
 // The absolute values come from the abs lookup table.
 static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
+  return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
 }
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
  // New form: non-zero when 4 * |p0 - q0| + |p1 - q1| <= t. The callers
  // updated in this change pass t = 2 * thresh + 1 (their 'thresh2').
  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
+  return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
 }
 // Returns non-zero only when both tests pass:
 //  - the edge test 4 * |p0 - q0| + |p1 - q1| <= t (new form; the callers
 //    updated in this change pass t = 2 * thresh + 1), and
 //  - each of the six neighbouring differences (p3-p2, p2-p1, p1-p0, q3-q2,
 //    q2-q1, q1-q0) has absolute value <= 'it'.
 // The -/+ pairs below are not line-for-line counterparts: the new version
 // also re-wraps the declarations, so read the '+' lines as one unit.
 static WEBP_INLINE int needs_filter2(const uint8_t* p,
                                     int step, int t, int it) {
-  const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
-  const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
+  const int p0 = p[-step], q0 = p[0];
-  if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
+  const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
-    return 0;
+  if ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) > t) return 0;
-  return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
+  return VP8kabs0[p3 - p2] <= it && VP8kabs0[p2 - p1] <= it &&
-         abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
+         VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
-         abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
+         VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
 }
 //------------------------------------------------------------------------------
@ -554,8 +539,9 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i, stride, thresh)) {
+    if (needs_filter(p + i, stride, thresh2)) {
      do_filter2(p + i, stride);
    }
  }
@ -563,8 +549,9 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i * stride, 1, thresh)) {
+    if (needs_filter(p + i * stride, 1, thresh2)) {
      do_filter2(p + i * stride, 1);
    }
  }
@ -592,8 +579,9 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh, ithresh)) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
        do_filter2(p, hstride);
      } else {
@ -607,8 +595,9 @@ static WEBP_INLINE void FilterLoop26(uint8_t* p,
 static WEBP_INLINE void FilterLoop24(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh, ithresh)) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
        do_filter2(p, hstride);
      } else {
@ -677,6 +666,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 //------------------------------------------------------------------------------
 VP8DecIdct2 VP8Transform;
 VP8DecIdct VP8TransformAC3;
 VP8DecIdct VP8TransformUV;
 VP8DecIdct VP8TransformDC;
 VP8DecIdct VP8TransformDCUV;
@ -696,14 +686,22 @@ VP8SimpleFilterFunc VP8SimpleHFilter16i;
 extern void VP8DspInitSSE2(void);
 extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);
 // Records the VP8GetCPUInfo value stored at the end of the last VP8DspInit()
 // run; VP8DspInit() returns early when the current VP8GetCPUInfo equals it.
 // It starts out holding its own address -- presumably a value that no real
 // VP8GetCPUInfo can equal -- so the first call always does the full setup.
 static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;
 void VP8DspInit(void) {
-  DspInitTables();
+  if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
  VP8InitClipTables();
  VP8TransformWHT = TransformWHT;
  VP8Transform = TransformTwo;
  VP8TransformUV = TransformUV;
  VP8TransformDC = TransformDC;
  VP8TransformDCUV = TransformDCUV;
  VP8TransformAC3 = TransformAC3;
  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
@ -719,7 +717,7 @@ void VP8DspInit(void) {
  VP8SimpleHFilter16i = SimpleHFilter16i;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8GetCPUInfo) {
+  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8DspInitSSE2();
@ -728,10 +726,11 @@ void VP8DspInit(void) {
    if (VP8GetCPUInfo(kNEON)) {
      VP8DspInitNEON();
    }
 #elif defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8DspInitMIPS32();
    }
 #endif
  }
  dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dsp/dec_clip_tables.c
+++ b/src/dsp/dec_clip_tables.c
@ -0,0 +1,366 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Clipping tables for filtering
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./dsp.h"
 // Build-time switch between the two table variants in this file. When the
 // macro is defined, the four lookup tables are constant initializer lists;
 // otherwise they are plain arrays that VP8InitClipTables() fills at run time.
 #define USE_STATIC_TABLES     // undefine to have run-time table initialization
 // Static variant. Every table is biased: the value for a signed input i is
 // stored in slot [bias + i], with bias 255 (abs0, clip1), 1020 (sclip1) and
 // 112 (sclip2).
 // NOTE(review): some initializer lists below (e.g. sclip1, clip1) appear to
 // hold one value fewer than the declared array length (2040 of 2041, 766 of
 // 767). If so, the last slot is implicitly zero -- confirm that slot is
 // never read, or append the missing value.
 #ifdef USE_STATIC_TABLES
 static const uint8_t abs0[255 + 255 + 1] = {
  0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
  0xf3, 0xf2, 0xf1, 0xf0, 0xef, 0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8,
  0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0, 0xdf, 0xde, 0xdd, 0xdc,
  0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3, 0xd2, 0xd1, 0xd0,
  0xcf, 0xce, 0xcd, 0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4,
  0xc3, 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
  0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac,
  0xab, 0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0,
  0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a, 0x99, 0x98, 0x97, 0x96, 0x95, 0x94,
  0x93, 0x92, 0x91, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88,
  0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80, 0x7f, 0x7e, 0x7d, 0x7c,
  0x7b, 0x7a, 0x79, 0x78, 0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70,
  0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68, 0x67, 0x66, 0x65, 0x64,
  0x63, 0x62, 0x61, 0x60, 0x5f, 0x5e, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58,
  0x57, 0x56, 0x55, 0x54, 0x53, 0x52, 0x51, 0x50, 0x4f, 0x4e, 0x4d, 0x4c,
  0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45, 0x44, 0x43, 0x42, 0x41, 0x40,
  0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34,
  0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28,
  0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c,
  0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
  0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
  0x03, 0x02, 0x01, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
  0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
  0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
  0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
  0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
  0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
  0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
  0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
  0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
  0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
  0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
  0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
  0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
  0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
  0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
  0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
  0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
  0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
  0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
  0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
  0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
  0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
 };
 static const int8_t sclip1[1020 + 1020 + 1] = {
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
  0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93,
  0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab,
  0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
  0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3,
  0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
  0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
  0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3,
  0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
  0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
  0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
  0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
  0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53,
  0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
  0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
  0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
  0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
 };
 // Clamps a signed value in [-112, 112] to [-16, 15]: slot [112 + i] holds the
 // result for input i. Negative results are written as their 8-bit pattern
 // (0xf0 is -16, 0xff is -1).
 // The list must supply all 112 + 112 + 1 = 225 values. It previously stopped
 // at 224, which left the slot for input +112 implicitly zero instead of 15.
 static const int8_t sclip2[112 + 112 + 1] = {
  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
  0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
  0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
  0xfc, 0xfd, 0xfe, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
 };
 static const uint8_t clip1[255 + 511 + 1] = {
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
  0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
  0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
  0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
  0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
  0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
  0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
  0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
  0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
  0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
  0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
  0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
  0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
  0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
  0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
  0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
  0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
  0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
  0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
  0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
  0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
  0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
 #else
 // Run-time variant: the tables start out zero-filled (static storage) and
 // are populated by VP8InitClipTables() on its first call. Sizes and biases
 // are the same as in the static variant.
 static uint8_t abs0[255 + 255 + 1];
 static int8_t sclip1[1020 + 1020 + 1];
 static int8_t sclip2[112 + 112 + 1];
 static uint8_t clip1[255 + 511 + 1];
 // Set to 1 by VP8InitClipTables() after all four tables have been written.
 // The original intent of 'volatile' here is to keep that store from being
 // moved ahead of the table writes, so the flag becomes true _last_.
 // NOTE(review): 'volatile' neither makes the flag access atomic nor orders
 // the non-volatile table stores for other threads -- confirm that callers
 // serialize the first call if concurrent initialization is possible.
 static volatile int tables_ok = 0;
 #endif
 // Exported views of the tables, already offset by each table's bias, so a
 // caller indexes them with the signed value itself (e.g. VP8kabs0[-3]).
 // Index ranges follow from the array lengths; result ranges from the
 // formulas in VP8InitClipTables().
 const int8_t* const VP8ksclip1 = &sclip1[1020];  // [-1020,1020] -> [-128,127]
 const int8_t* const VP8ksclip2 = &sclip2[112];   // [-112,112] -> [-16,15]
 const uint8_t* const VP8kclip1 = &clip1[255];    // [-255,511] -> [0,255]
 const uint8_t* const VP8kabs0 = &abs0[255];      // [-255,255] -> abs value
 // Populates the lookup tables when they are not compiled in as constants.
 // With USE_STATIC_TABLES defined this function does nothing. Otherwise the
 // first call writes every slot of abs0, sclip1, sclip2 and clip1 and then
 // sets tables_ok; later calls see the flag and skip the work.
 void VP8InitClipTables(void) {
 #if !defined(USE_STATIC_TABLES)
  int i;
  if (!tables_ok) {
    // abs0[255 + i] = |i| for i in [-255, 255]
    for (i = -255; i <= 255; ++i) {
      abs0[255 + i] = (i < 0) ? -i : i;
    }
    // sclip1[1020 + i] = i clamped to [-128, 127], for i in [-1020, 1020]
    for (i = -1020; i <= 1020; ++i) {
      sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
    }
    // sclip2[112 + i] = i clamped to [-16, 15], for i in [-112, 112]
    for (i = -112; i <= 112; ++i) {
      sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
    }
    // clip1[255 + i] = i clamped to [0, 255]. The array has 255 + 511 + 1
    // slots, i.e. inputs -255..511, so the loop has to run through i = 511.
    // Stopping at 255 + 255 left the last slot at its zero default.
    for (i = -255; i <= 511; ++i) {
      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
    }
    tables_ok = 1;  // set only after every table is complete
  }
 #endif    // USE_STATIC_TABLES
 }
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@ -0,0 +1,578 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // MIPS version of dsp functions
 //
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 #include "./dsp.h"
 #if defined(WEBP_USE_MIPS32)
 // Fixed-point multipliers used by TransformOne(): every product with kC1 or
 // kC2 is followed there by an arithmetic shift right by 16, so they act as
 // kC1 / 2^16 (= 85627 / 65536, i.e. 1 + 20091 / 65536) and kC2 / 2^16.
 // NOTE(review): presumably sqrt(2) * cos(pi / 8) and sqrt(2) * sin(pi / 8)
 // respectively -- confirm against the reference transform.
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 // Branch-free absolute value. Relies on 'x >> 31' replicating the sign bit of
 // a 32-bit int: the mask is 0 for x >= 0 and all ones (-1) for x < 0.
 static WEBP_INLINE int abs_mips32(int x) {
  const int mask = x >> 31;
  // x >= 0: (x + 0) ^ 0 == x.   x < 0: ~(x - 1) == -x.
  return (x + mask) ^ mask;
 }
 // 4 pixels in, 2 pixels out
 // Reads the taps p1, p0 | q0, q1 ('step' bytes apart around p) and rewrites
 // only p0 and q0.
 static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int delta = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
  const int dq = VP8ksclip2[(delta + 4) >> 3];   // subtracted from q0
  const int dp = VP8ksclip2[(delta + 3) >> 3];   // added to p0
  p[-step] = VP8kclip1[p0 + dp];
  p[0] = VP8kclip1[q0 - dq];
 }
 // 4 pixels in, 4 pixels out
 // Reads the taps p1, p0 | q0, q1 ('step' bytes apart around p) and rewrites
 // all four of them.
 static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int delta = 3 * (q0 - p0);
  const int inner_q = VP8ksclip2[(delta + 4) >> 3];
  const int inner_p = VP8ksclip2[(delta + 3) >> 3];
  const int outer = (inner_q + 1) >> 1;   // half-strength for p1 / q1
  p[-2 * step] = VP8kclip1[p1 + outer];
  p[-step] = VP8kclip1[p0 + inner_p];
  p[0] = VP8kclip1[q0 - inner_q];
  p[step] = VP8kclip1[q1 - outer];
 }
 // 6 pixels in, 6 pixels out
 // Reads the taps p2, p1, p0 | q0, q1, q2 ('step' bytes apart around p) and
 // rewrites all six, with a correction that weakens away from the edge.
 static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
  const int p2 = p[-3 * step];
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int q2 = p[2 * step];
  const int outer_diff = VP8ksclip1[p1 - q1];
  const int w = VP8ksclip1[3 * (q0 - p0) + outer_diff];
  // 27 * w + 63 == (3 * w + 7) * 9, and likewise for the 18 and 9 weights
  const int w1 = (27 * w + 63) >> 7;
  const int w2 = (18 * w + 63) >> 7;
  const int w3 = (9 * w + 63) >> 7;
  p[-3 * step] = VP8kclip1[p2 + w3];
  p[-2 * step] = VP8kclip1[p1 + w2];
  p[-step] = VP8kclip1[p0 + w1];
  p[0] = VP8kclip1[q0 - w1];
  p[step] = VP8kclip1[q1 - w2];
  p[2 * step] = VP8kclip1[q2 - w3];
 }
 // Returns 1 if either |p1 - p0| or |q1 - q0| exceeds 'thresh', 0 otherwise.
 // The taps are 'step' bytes apart around p.
 static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int p_var = abs_mips32(p1 - p0);
  const int q_var = abs_mips32(q1 - q0);
  return (p_var > thresh) || (q_var > thresh);
 }
 // Returns 1 when 2 * |p0 - q0| + |p1 - q1| / 2 does not exceed 'thresh',
 // 0 otherwise. The taps are 'step' bytes apart around p.
 static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int edge = 2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1);
  return !(edge > thresh);
 }
 // Returns 1 when the edge measure 2 * |p0 - q0| + |p1 - q1| / 2 is at most 't'
 // AND every difference between neighbouring taps on each side (p3..p0 and
 // q0..q3) is at most 'it'; returns 0 otherwise.
 static WEBP_INLINE int needs_filter2(const uint8_t* p,
                                      int step, int t, int it) {
  const int p3 = p[-4 * step];
  const int p2 = p[-3 * step];
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int q2 = p[2 * step];
  const int q3 = p[3 * step];
  const int edge = 2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1);
  if (edge > t) return 0;
  if (abs_mips32(p3 - p2) > it) return 0;
  if (abs_mips32(p2 - p1) > it) return 0;
  if (abs_mips32(p1 - p0) > it) return 0;
  if (abs_mips32(q3 - q2) > it) return 0;
  if (abs_mips32(q2 - q1) > it) return 0;
  if (abs_mips32(q1 - q0) > it) return 0;
  return 1;
 }
 // Visits 'size' positions, 'vstride' bytes apart, starting at p. At each one
 // that passes needs_filter2() (taps 'hstride' bytes apart), applies
 // do_filter2() if hev() reports high variance, do_filter6() otherwise.
 static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                      int hstride, int vstride, int size,
                                      int thresh, int ithresh, int hev_thresh) {
  int remaining;
  for (remaining = size; remaining > 0; --remaining, p += vstride) {
    if (!needs_filter2(p, hstride, thresh, ithresh)) {
      continue;
    }
    if (hev(p, hstride, hev_thresh)) {
      do_filter2(p, hstride);
    } else {
      do_filter6(p, hstride);
    }
  }
 }
 // Visits 'size' positions, 'vstride' bytes apart, starting at p. At each one
 // that passes needs_filter2() (taps 'hstride' bytes apart), applies
 // do_filter2() if hev() reports high variance, do_filter4() otherwise.
 static WEBP_INLINE void FilterLoop24(uint8_t* p,
                                      int hstride, int vstride, int size,
                                      int thresh, int ithresh, int hev_thresh) {
  int remaining;
  for (remaining = size; remaining > 0; --remaining, p += vstride) {
    if (!needs_filter2(p, hstride, thresh, ithresh)) {
      continue;
    }
    if (hev(p, hstride, hev_thresh)) {
      do_filter2(p, hstride);
    } else {
      do_filter4(p, hstride);
    }
  }
 }
 // on macroblock edges
 // Runs FilterLoop26() over 16 consecutive bytes starting at p (vstride = 1);
 // the filter taps of each position are 'stride' bytes apart.
 static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
 }
 // Runs FilterLoop26() over 16 positions that are 'stride' bytes apart,
 // starting at p; the filter taps of each position are adjacent bytes
 // (hstride = 1).
 static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
 }
 // 8-pixels wide variant, for chroma filtering
 // Applies FilterLoop26() to 8 consecutive bytes of the 'u' plane, then of the
 // 'v' plane, with the same thresholds; taps are 'stride' bytes apart.
 static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 // Applies FilterLoop26() to 8 positions, 'stride' bytes apart, of the 'u'
 // plane and then of the 'v' plane; taps are adjacent bytes (hstride = 1).
 static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 // Inner-edge counterpart of VFilter8(): uses FilterLoop24() and starts
 // 4 * stride bytes past 'u' and 'v' (8 consecutive bytes each, taps 'stride'
 // bytes apart).
 static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 // Inner-edge counterpart of HFilter8(): uses FilterLoop24() and starts 4 bytes
 // past 'u' and 'v' (8 positions 'stride' bytes apart each, taps adjacent).
 static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 // on three inner edges
 // Runs FilterLoop24() on 16 consecutive bytes at p + 4 * stride,
 // p + 8 * stride and p + 12 * stride, in that order.
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    uint8_t* const row = p + edge * 4 * stride;
    FilterLoop24(row, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
 }
 // Runs FilterLoop24() on 16 positions ('stride' bytes apart) starting at
 // p + 4, p + 8 and p + 12, in that order.
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    uint8_t* const col = p + edge * 4;
    FilterLoop24(col, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
 }
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 // For each of the 16 consecutive bytes starting at p, applies do_filter2()
 // (taps 'stride' bytes apart) when needs_filter() accepts the position.
 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int col;
  for (col = 0; col < 16; ++col) {
    uint8_t* const pos = p + col;
    if (needs_filter(pos, stride, thresh)) {
      do_filter2(pos, stride);
    }
  }
 }
 // For each of 16 positions 'stride' bytes apart starting at p, applies
 // do_filter2() (taps on adjacent bytes) when needs_filter() accepts it.
 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  int row;
  for (row = 0; row < 16; ++row) {
    uint8_t* const pos = p + row * stride;
    if (needs_filter(pos, 1, thresh)) {
      do_filter2(pos, 1);
    }
  }
 }
 // Applies SimpleVFilter16() at p + 4 * stride, p + 8 * stride and
 // p + 12 * stride, in that order.
 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleVFilter16(p + edge * 4 * stride, stride, thresh);
  }
 }
 // Applies SimpleHFilter16() at p + 4, p + 8 and p + 12, in that order.
 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleHFilter16(p + edge * 4, stride, thresh);
  }
 }
 // Inverse transform of one 4x4 block, written as a single MIPS32 inline-asm
 // sequence.
 //   in  : 16 int16_t coefficients (loaded from byte offsets 0..30).
 //   dst : 4x4 block of pixels that is updated in place. The four rows are
 //         accessed at byte offsets 0, 32, 64 and 96 from dst, i.e. a row pitch
 //         of 32 bytes is hard-coded (presumably BPS -- confirm).
 // First pass: for k = 0..3, with x0 = in[k], x1 = in[k + 4], x2 = in[k + 8],
 // x3 = in[k + 12]:
 //   a = x0 + x2,  b = x0 - x2,
 //   c = ((x1 * kC2) >> 16) - ((x3 * kC1) >> 16),
 //   d = ((x1 * kC1) >> 16) + ((x3 * kC2) >> 16),
 // producing a + d, b + c, b - c, a - d. The second pass applies the same
 // butterfly across the first-pass results, shifts the outputs right by 3
 // (a bias of 4 is added beforehand) and adds them to the pixels of dst,
 // saturating to [0, 255].
 static void TransformOne(const int16_t* in, uint8_t* dst) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14;
  int temp15, temp16, temp17, temp18;
  // non-const alias handed to the asm as input operand; the asm only loads
  // through it
  int16_t* p_in = (int16_t*)in;
  // loops unrolled and merged to avoid usage of tmp buffer
  // and to reduce number of stalls. MUL macro is written
  // in assembler and inlined
  __asm__ volatile(
    // ---- first pass (the four groups k = 0..3 are interleaved) ----
    "lh       %[temp0],  0(%[in])                      \n\t"
    "lh       %[temp8],  16(%[in])                     \n\t"
    "lh       %[temp4],  8(%[in])                      \n\t"
    "lh       %[temp12], 24(%[in])                     \n\t"
    "addu     %[temp16], %[temp0],  %[temp8]           \n\t"
    "subu     %[temp0],  %[temp0],  %[temp8]           \n\t"
    "mul      %[temp8],  %[temp4],  %[kC2]             \n\t"
    "mul      %[temp17], %[temp12], %[kC1]             \n\t"
    "mul      %[temp4],  %[temp4],  %[kC1]             \n\t"
    "mul      %[temp12], %[temp12], %[kC2]             \n\t"
    "lh       %[temp1],  2(%[in])                      \n\t"
    "lh       %[temp5],  10(%[in])                     \n\t"
    "lh       %[temp9],  18(%[in])                     \n\t"
    "lh       %[temp13], 26(%[in])                     \n\t"
    "sra      %[temp8],  %[temp8],  16                 \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "sra      %[temp4],  %[temp4],  16                 \n\t"
    "sra      %[temp12], %[temp12], 16                 \n\t"
    "lh       %[temp2],  4(%[in])                      \n\t"
    "lh       %[temp6],  12(%[in])                     \n\t"
    "lh       %[temp10], 20(%[in])                     \n\t"
    "lh       %[temp14], 28(%[in])                     \n\t"
    "subu     %[temp17], %[temp8],  %[temp17]          \n\t"
    "addu     %[temp4],  %[temp4],  %[temp12]          \n\t"
    "addu     %[temp8],  %[temp16], %[temp4]           \n\t"
    "subu     %[temp4],  %[temp16], %[temp4]           \n\t"
    "addu     %[temp16], %[temp1],  %[temp9]           \n\t"
    "subu     %[temp1],  %[temp1],  %[temp9]           \n\t"
    "lh       %[temp3],  6(%[in])                      \n\t"
    "lh       %[temp7],  14(%[in])                     \n\t"
    "lh       %[temp11], 22(%[in])                     \n\t"
    "lh       %[temp15], 30(%[in])                     \n\t"
    "addu     %[temp12], %[temp0],  %[temp17]          \n\t"
    "subu     %[temp0],  %[temp0],  %[temp17]          \n\t"
    "mul      %[temp9],  %[temp5],  %[kC2]             \n\t"
    "mul      %[temp17], %[temp13], %[kC1]             \n\t"
    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
    "mul      %[temp13], %[temp13], %[kC2]             \n\t"
    "sra      %[temp9],  %[temp9],  16                 \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "subu     %[temp17], %[temp9],  %[temp17]          \n\t"
    "sra      %[temp5],  %[temp5],  16                 \n\t"
    "sra      %[temp13], %[temp13], 16                 \n\t"
    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
    "addu     %[temp13], %[temp1],  %[temp17]          \n\t"
    "subu     %[temp1],  %[temp1],  %[temp17]          \n\t"
    "mul      %[temp17], %[temp14], %[kC1]             \n\t"
    "mul      %[temp14], %[temp14], %[kC2]             \n\t"
    "addu     %[temp9],  %[temp16], %[temp5]           \n\t"
    "subu     %[temp5],  %[temp16], %[temp5]           \n\t"
    "addu     %[temp16], %[temp2],  %[temp10]          \n\t"
    "subu     %[temp2],  %[temp2],  %[temp10]          \n\t"
    "mul      %[temp10], %[temp6],  %[kC2]             \n\t"
    "mul      %[temp6],  %[temp6],  %[kC1]             \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "sra      %[temp14], %[temp14], 16                 \n\t"
    "sra      %[temp10], %[temp10], 16                 \n\t"
    "sra      %[temp6],  %[temp6],  16                 \n\t"
    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
    "addu     %[temp6],  %[temp6],  %[temp14]          \n\t"
    "addu     %[temp10], %[temp16], %[temp6]           \n\t"
    "subu     %[temp6],  %[temp16], %[temp6]           \n\t"
    "addu     %[temp14], %[temp2],  %[temp17]          \n\t"
    "subu     %[temp2],  %[temp2],  %[temp17]          \n\t"
    "mul      %[temp17], %[temp15], %[kC1]             \n\t"
    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
    "addu     %[temp16], %[temp3],  %[temp11]          \n\t"
    "subu     %[temp3],  %[temp3],  %[temp11]          \n\t"
    "mul      %[temp11], %[temp7],  %[kC2]             \n\t"
    "mul      %[temp7],  %[temp7],  %[kC1]             \n\t"
    // bias of 4 on the four k = 0 results, ahead of the final 'sra ..., 3'
    "addiu    %[temp8],  %[temp8],  4                  \n\t"
    "addiu    %[temp12], %[temp12], 4                  \n\t"
    "addiu    %[temp0],  %[temp0],  4                  \n\t"
    "addiu    %[temp4],  %[temp4],  4                  \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "sra      %[temp15], %[temp15], 16                 \n\t"
    "sra      %[temp11], %[temp11], 16                 \n\t"
    "sra      %[temp7],  %[temp7],  16                 \n\t"
    "subu     %[temp17], %[temp11], %[temp17]          \n\t"
    "addu     %[temp7],  %[temp7],  %[temp15]          \n\t"
    "addu     %[temp15], %[temp3],  %[temp17]          \n\t"
    "subu     %[temp3],  %[temp3],  %[temp17]          \n\t"
    "addu     %[temp11], %[temp16], %[temp7]           \n\t"
    "subu     %[temp7],  %[temp16], %[temp7]           \n\t"
    // ---- second pass, on the first-pass results ----
    "addu     %[temp16], %[temp8],  %[temp10]          \n\t"
    "subu     %[temp8],  %[temp8],  %[temp10]          \n\t"
    "mul      %[temp10], %[temp9],  %[kC2]             \n\t"
    "mul      %[temp17], %[temp11], %[kC1]             \n\t"
    "mul      %[temp9],  %[temp9],  %[kC1]             \n\t"
    "mul      %[temp11], %[temp11], %[kC2]             \n\t"
    "sra      %[temp10], %[temp10], 16                 \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "sra      %[temp9],  %[temp9],  16                 \n\t"
    "sra      %[temp11], %[temp11], 16                 \n\t"
    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
    "addu     %[temp11], %[temp9],  %[temp11]          \n\t"
    "addu     %[temp10], %[temp12], %[temp14]          \n\t"
    "subu     %[temp12], %[temp12], %[temp14]          \n\t"
    "mul      %[temp14], %[temp13], %[kC2]             \n\t"
    "mul      %[temp9],  %[temp15], %[kC1]             \n\t"
    "mul      %[temp13], %[temp13], %[kC1]             \n\t"
    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
    "sra      %[temp14], %[temp14], 16                 \n\t"
    "sra      %[temp9],  %[temp9],  16                 \n\t"
    "sra      %[temp13], %[temp13], 16                 \n\t"
    "sra      %[temp15], %[temp15], 16                 \n\t"
    "subu     %[temp9],  %[temp14], %[temp9]           \n\t"
    "addu     %[temp15], %[temp13], %[temp15]          \n\t"
    "addu     %[temp14], %[temp0],  %[temp2]           \n\t"
    "subu     %[temp0],  %[temp0],  %[temp2]           \n\t"
    "mul      %[temp2],  %[temp1],  %[kC2]             \n\t"
    "mul      %[temp13], %[temp3],  %[kC1]             \n\t"
    "mul      %[temp1],  %[temp1],  %[kC1]             \n\t"
    "mul      %[temp3],  %[temp3],  %[kC2]             \n\t"
    "sra      %[temp2],  %[temp2],  16                 \n\t"
    "sra      %[temp13], %[temp13], 16                 \n\t"
    "sra      %[temp1],  %[temp1],  16                 \n\t"
    "sra      %[temp3],  %[temp3],  16                 \n\t"
    "subu     %[temp13], %[temp2],  %[temp13]          \n\t"
    "addu     %[temp3],  %[temp1],  %[temp3]           \n\t"
    "addu     %[temp2],  %[temp4],  %[temp6]           \n\t"
    "subu     %[temp4],  %[temp4],  %[temp6]           \n\t"
    "mul      %[temp6],  %[temp5],  %[kC2]             \n\t"
    "mul      %[temp1],  %[temp7],  %[kC1]             \n\t"
    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
    "mul      %[temp7],  %[temp7],  %[kC2]             \n\t"
    "sra      %[temp6],  %[temp6],  16                 \n\t"
    "sra      %[temp1],  %[temp1],  16                 \n\t"
    "sra      %[temp5],  %[temp5],  16                 \n\t"
    "sra      %[temp7],  %[temp7],  16                 \n\t"
    "subu     %[temp1],  %[temp6],  %[temp1]           \n\t"
    "addu     %[temp7],  %[temp5],  %[temp7]           \n\t"
    "addu     %[temp5],  %[temp16], %[temp11]          \n\t"
    "subu     %[temp16], %[temp16], %[temp11]          \n\t"
    "addu     %[temp11], %[temp8],  %[temp17]          \n\t"
    "subu     %[temp8],  %[temp8],  %[temp17]          \n\t"
    "sra      %[temp5],  %[temp5],  3                  \n\t"
    "sra      %[temp16], %[temp16], 3                  \n\t"
    "sra      %[temp11], %[temp11], 3                  \n\t"
    "sra      %[temp8],  %[temp8],  3                  \n\t"
    "addu     %[temp17], %[temp10], %[temp15]          \n\t"
    "subu     %[temp10], %[temp10], %[temp15]          \n\t"
    "addu     %[temp15], %[temp12], %[temp9]           \n\t"
    "subu     %[temp12], %[temp12], %[temp9]           \n\t"
    "sra      %[temp17], %[temp17], 3                  \n\t"
    "sra      %[temp10], %[temp10], 3                  \n\t"
    "sra      %[temp15], %[temp15], 3                  \n\t"
    "sra      %[temp12], %[temp12], 3                  \n\t"
    "addu     %[temp9],  %[temp14], %[temp3]           \n\t"
    "subu     %[temp14], %[temp14], %[temp3]           \n\t"
    "addu     %[temp3],  %[temp0],  %[temp13]          \n\t"
    "subu     %[temp0],  %[temp0],  %[temp13]          \n\t"
    "sra      %[temp9],  %[temp9],  3                  \n\t"
    "sra      %[temp14], %[temp14], 3                  \n\t"
    "sra      %[temp3],  %[temp3],  3                  \n\t"
    "sra      %[temp0],  %[temp0],  3                  \n\t"
    "addu     %[temp13], %[temp2],  %[temp7]           \n\t"
    "subu     %[temp2],  %[temp2],  %[temp7]           \n\t"
    "addu     %[temp7],  %[temp4],  %[temp1]           \n\t"
    "subu     %[temp4],  %[temp4],  %[temp1]           \n\t"
    "sra      %[temp13], %[temp13], 3                  \n\t"
    "sra      %[temp2],  %[temp2],  3                  \n\t"
    "sra      %[temp7],  %[temp7],  3                  \n\t"
    "sra      %[temp4],  %[temp4],  3                  \n\t"
    // temp6 = 255: the upper saturation value used by every clamp below
    "addiu    %[temp6],  $zero,     255                \n\t"
    // ---- row at dst + 0: pixel + residual, saturated to [0, 255] ----
    // Clamp idiom, repeated for all 16 pixels: if (sum >> 8) == 0 the sum is
    // already in [0, 255] and is kept as is. Otherwise it is zeroed, then
    // replaced by 255 when (sum >> 31) == 0, i.e. when the sum was positive.
    "lbu      %[temp1],  0(%[dst])                     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
    "sra      %[temp5],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
    "beqz     %[temp5],  1f                            \n\t"
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "1:                                                  \n\t"
    "lbu      %[temp18], 1(%[dst])                     \n\t"
    "sb       %[temp1],  0(%[dst])                     \n\t"
    "addu     %[temp18], %[temp18], %[temp11]          \n\t"
    "sra      %[temp11], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
    "beqz     %[temp11], 2f                            \n\t"
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "2:                                                  \n\t"
    "lbu      %[temp1],  2(%[dst])                     \n\t"
    "sb       %[temp18], 1(%[dst])                     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
    "sra      %[temp8],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
    "beqz     %[temp8],  3f                            \n\t"
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "3:                                                  \n\t"
    "lbu      %[temp18], 3(%[dst])                     \n\t"
    "sb       %[temp1],  2(%[dst])                     \n\t"
    "addu     %[temp18], %[temp18], %[temp16]          \n\t"
    "sra      %[temp16], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
    "beqz     %[temp16], 4f                            \n\t"
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "4:                                                  \n\t"
    "sb       %[temp18], 3(%[dst])                     \n\t"
    // ---- row at dst + 32 ----
    "lbu      %[temp5],  32(%[dst])                    \n\t"
    "lbu      %[temp8],  33(%[dst])                    \n\t"
    "lbu      %[temp11], 34(%[dst])                    \n\t"
    "lbu      %[temp16], 35(%[dst])                    \n\t"
    "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
    "addu     %[temp11], %[temp11], %[temp12]          \n\t"
    "addu     %[temp16], %[temp16], %[temp10]          \n\t"
    "sra      %[temp18], %[temp5],  8                  \n\t"
    "sra      %[temp1],  %[temp5],  31                 \n\t"
    "beqz     %[temp18], 5f                            \n\t"
    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
  "5:                                                  \n\t"
    "sra      %[temp18], %[temp8],  8                  \n\t"
    "sra      %[temp1],  %[temp8],  31                 \n\t"
    "beqz     %[temp18], 6f                            \n\t"
    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
    "movz     %[temp8],  %[temp6],  %[temp1]           \n\t"
  "6:                                                  \n\t"
    "sra      %[temp18], %[temp11], 8                  \n\t"
    "sra      %[temp1],  %[temp11], 31                 \n\t"
    "sra      %[temp17], %[temp16], 8                  \n\t"
    "sra      %[temp15], %[temp16], 31                 \n\t"
    "beqz     %[temp18], 7f                            \n\t"
    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
    "movz     %[temp11], %[temp6],  %[temp1]           \n\t"
  "7:                                                  \n\t"
    "beqz     %[temp17], 8f                            \n\t"
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
  "8:                                                  \n\t"
    "sb       %[temp5],  32(%[dst])                    \n\t"
    "sb       %[temp8],  33(%[dst])                    \n\t"
    "sb       %[temp11], 34(%[dst])                    \n\t"
    "sb       %[temp16], 35(%[dst])                    \n\t"
    // ---- row at dst + 64 ----
    "lbu      %[temp5],  64(%[dst])                    \n\t"
    "lbu      %[temp8],  65(%[dst])                    \n\t"
    "lbu      %[temp11], 66(%[dst])                    \n\t"
    "lbu      %[temp16], 67(%[dst])                    \n\t"
    "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
    "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
    "addu     %[temp11], %[temp11], %[temp0]           \n\t"
    "addu     %[temp16], %[temp16], %[temp14]          \n\t"
    "sra      %[temp18], %[temp5],  8                  \n\t"
    "sra      %[temp1],  %[temp5],  31                 \n\t"
    "sra      %[temp17], %[temp8],  8                  \n\t"
    "sra      %[temp15], %[temp8],  31                 \n\t"
    "sra      %[temp12], %[temp11], 8                  \n\t"
    "sra      %[temp10], %[temp11], 31                 \n\t"
    "sra      %[temp9],  %[temp16], 8                  \n\t"
    "sra      %[temp3],  %[temp16], 31                 \n\t"
    "beqz     %[temp18], 9f                            \n\t"
    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
  "9:                                                  \n\t"
    "beqz     %[temp17], 10f                           \n\t"
    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
  "10:                                                 \n\t"
    "beqz     %[temp12], 11f                           \n\t"
    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
  "11:                                                 \n\t"
    "beqz     %[temp9],  12f                           \n\t"
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "12:                                                 \n\t"
    "sb       %[temp5],  64(%[dst])                    \n\t"
    "sb       %[temp8],  65(%[dst])                    \n\t"
    "sb       %[temp11], 66(%[dst])                    \n\t"
    "sb       %[temp16], 67(%[dst])                    \n\t"
    // ---- row at dst + 96 ----
    "lbu      %[temp5],  96(%[dst])                    \n\t"
    "lbu      %[temp8],  97(%[dst])                    \n\t"
    "lbu      %[temp11], 98(%[dst])                    \n\t"
    "lbu      %[temp16], 99(%[dst])                    \n\t"
    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
    "addu     %[temp11], %[temp11], %[temp4]           \n\t"
    "addu     %[temp16], %[temp16], %[temp2]           \n\t"
    "sra      %[temp18], %[temp5],  8                  \n\t"
    "sra      %[temp1],  %[temp5],  31                 \n\t"
    "sra      %[temp17], %[temp8],  8                  \n\t"
    "sra      %[temp15], %[temp8],  31                 \n\t"
    "sra      %[temp12], %[temp11], 8                  \n\t"
    "sra      %[temp10], %[temp11], 31                 \n\t"
    "sra      %[temp9],  %[temp16], 8                  \n\t"
    "sra      %[temp3],  %[temp16], 31                 \n\t"
    "beqz     %[temp18], 13f                           \n\t"
    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
  "13:                                                 \n\t"
    "beqz     %[temp17], 14f                           \n\t"
    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
  "14:                                                 \n\t"
    "beqz     %[temp12], 15f                           \n\t"
    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
  "15:                                                 \n\t"
    "beqz     %[temp9],  16f                           \n\t"
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "16:                                                 \n\t"
    "sb       %[temp5],  96(%[dst])                    \n\t"
    "sb       %[temp8],  97(%[dst])                    \n\t"
    "sb       %[temp11], 98(%[dst])                    \n\t"
    "sb       %[temp16], 99(%[dst])                    \n\t"
    // outputs: every temporary is an early-clobber ("=&r") scratch register
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
      [temp18]"=&r"(temp18)
    // inputs: the two pointers and the two multipliers, all in registers
    : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
    // clobbers: "memory" since the asm stores to dst; "hi"/"lo" presumably
    // because 'mul' may modify them -- confirm against the ISA manual
    : "memory", "hi", "lo"
  );
 }
 // Applies TransformOne() to the block at (in, dst) and, when 'do_two' is
 // non-zero, to a second block located at (in + 16, dst + 4).
 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    TransformOne(in + 16 * n, dst + 4 * n);
  }
 }
 #endif  // WEBP_USE_MIPS32
 //------------------------------------------------------------------------------
 // Entry point
 extern void VP8DspInitMIPS32(void);
 // Points the decoder's dsp hooks at the MIPS32 implementations of this file,
 // after making sure the clip tables are initialized. Does nothing when
 // WEBP_USE_MIPS32 is not defined.
 void VP8DspInitMIPS32(void) {
 #if defined(WEBP_USE_MIPS32)
  VP8InitClipTables();

  // inverse transform
  VP8Transform = TransformTwo;

  // 16-pixel filters: macroblock edges, then inner edges
  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16i = HFilter16i;

  // 8-pixel (chroma) filters: macroblock edges, then inner edges
  VP8VFilter8 = VFilter8;
  VP8HFilter8 = HFilter8;
  VP8VFilter8i = VFilter8i;
  VP8HFilter8i = HFilter8i;

  // simple filters
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
 #endif  // WEBP_USE_MIPS32
 }
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of some decoding functions (idct, loop filtering).
@ -12,19 +14,19 @@
 #include "./dsp.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #if defined(WEBP_USE_SSE2)
 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
 // one it seems => disable it by default. Uncomment the following to enable:
 // #define USE_TRANSFORM_AC3
 #include <emmintrin.h>
 #include "../dec/vp8i.h"
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
-static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -199,16 +201,16 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
      // Load eight bytes/pixels per line.
-      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
+      dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
-      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
+      dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
-      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
+      dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
-      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
+      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
    } else {
      // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
+      dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
-      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
+      dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
-      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
+      dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
-      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
+      dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
@ -228,20 +230,66 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
-      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
+      _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
-      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
+      _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
-      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
+      _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
-      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
+      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
    } else {
      // Store four bytes/pixels per line.
-      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
+      *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
-      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
+      *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
-      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
+      *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
-      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
+      *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
    }
  }
 }
 #if defined(USE_TRANSFORM_AC3)
 #define MUL(a, b) (((a) * (b)) >> 16)
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
  static const int kC2 = 35468;
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  const __m128i m0 = _mm_adds_epi16(B, d4);
  const __m128i m1 = _mm_adds_epi16(B, c4);
  const __m128i m2 = _mm_subs_epi16(B, c4);
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
  dst2 = _mm_unpacklo_epi8(dst2, zero);
  dst3 = _mm_unpacklo_epi8(dst3, zero);
  // Add the inverse transform.
  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
  // Unsigned saturate to 8b.
  dst0 = _mm_packus_epi16(dst0, dst0);
  dst1 = _mm_packus_epi16(dst1, dst1);
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
 }
 #undef MUL
 #endif   // USE_TRANSFORM_AC3
 //------------------------------------------------------------------------------
 // Loop Filter (Paragraph 15)
@ -250,20 +298,15 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
    _mm_subs_epu8((q), (p)),                                                   \
    _mm_subs_epu8((p), (q)))
-// Shift each byte of "a" by N bits while preserving by the sign bit.
+// Shift each byte of "x" by 3 bits while preserving the sign bit.
-//
+static WEBP_INLINE void SignedShift8b(__m128i* const x) {
-// It first shifts the lower bytes of the words and then the upper bytes and
+  const __m128i zero = _mm_setzero_si128();
-// then merges the results together.
+  const __m128i signs = _mm_cmpgt_epi8(zero, *x);
-#define SIGNED_SHIFT_N(a, N) {                                                 \
+  const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs);  // s8 -> s16 sign extend
-  __m128i t = a;                                                               \
+  const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
-  t = _mm_slli_epi16(t, 8);                                                    \
+  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
-  t = _mm_srai_epi16(t, N);                                                    \
+  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
-  t = _mm_srli_epi16(t, 8);                                                    \
+  *x = _mm_packs_epi16(lo_1, hi_1);
                                                                               \
  a = _mm_srai_epi16(a, N + 8);                                                \
  a = _mm_slli_epi16(a, 8);                                                    \
                                                                               \
  a = _mm_or_si128(t, a);                                                      \
 }
 #define FLIP_SIGN_BIT2(a, b) {                                                 \
@ -276,103 +319,123 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
  FLIP_SIGN_BIT2(c, d);                                                        \
 }
-#define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) {                      \
+// input/output is uint8_t
-  const __m128i zero = _mm_setzero_si128();                                    \
+static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
-  const __m128i t_1 = MM_ABS(p1, p0);                                          \
+                                  const __m128i* const p0,
-  const __m128i t_2 = MM_ABS(q1, q0);                                          \
+                                  const __m128i* const q0,
-                                                                               \
+                                  const __m128i* const q1,
-  const __m128i h = _mm_set1_epi8(hev_thresh);                                 \
+                                  int hev_thresh, __m128i* const not_hev) {
-  const __m128i t_3 = _mm_subs_epu8(t_1, h);  /* abs(p1 - p0) - hev_tresh */   \
+  const __m128i zero = _mm_setzero_si128();
-  const __m128i t_4 = _mm_subs_epu8(t_2, h);  /* abs(q1 - q0) - hev_tresh */   \
+  const __m128i t_1 = MM_ABS(*p1, *p0);
-                                                                               \
+  const __m128i t_2 = MM_ABS(*q1, *q0);
-  not_hev = _mm_or_si128(t_3, t_4);                                            \
+
-  not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\
+  const __m128i h = _mm_set1_epi8(hev_thresh);
  const __m128i t_3 = _mm_subs_epu8(t_1, h);  // abs(p1 - p0) - hev_thresh
  const __m128i t_4 = _mm_subs_epu8(t_2, h);  // abs(q1 - q0) - hev_thresh
  *not_hev = _mm_or_si128(t_3, t_4);
  *not_hev = _mm_cmpeq_epi8(*not_hev, zero);  // 0xff if t_1 <= h && t_2 <= h
 }
-#define GET_BASE_DELTA(p1, p0, q0, q1, o) {                                    \
+// input pixels are int8_t
-  const __m128i qp0 = _mm_subs_epi8(q0, p0);  /* q0 - p0 */                    \
+static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
-  o = _mm_subs_epi8(p1, q1);            /* p1 - q1 */                          \
+                                     const __m128i* const p0,
-  o = _mm_adds_epi8(o, qp0);            /* p1 - q1 + 1 * (q0 - p0) */          \
+                                     const __m128i* const q0,
-  o = _mm_adds_epi8(o, qp0);            /* p1 - q1 + 2 * (q0 - p0) */          \
+                                     const __m128i* const q1,
-  o = _mm_adds_epi8(o, qp0);            /* p1 - q1 + 3 * (q0 - p0) */          \
+                                     __m128i* const delta) {
  // beware of addition order, for saturation!
  const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1);   // p1 - q1
  const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0);   // q0 - p0
  const __m128i s1 = _mm_adds_epi8(p1_q1, q0_p0);  // p1 - q1 + 1 * (q0 - p0)
  const __m128i s2 = _mm_adds_epi8(q0_p0, s1);     // p1 - q1 + 2 * (q0 - p0)
  const __m128i s3 = _mm_adds_epi8(q0_p0, s2);     // p1 - q1 + 3 * (q0 - p0)
  *delta = s3;
 }
-#define DO_SIMPLE_FILTER(p0, q0, fl) {                                         \
+// input and output are int8_t
-  const __m128i three = _mm_set1_epi8(3);                                      \
+static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
-  const __m128i four = _mm_set1_epi8(4);                                       \
+                                       const __m128i* const fl) {
-  __m128i v3 = _mm_adds_epi8(fl, three);                                       \
+  const __m128i k3 = _mm_set1_epi8(3);
-  __m128i v4 = _mm_adds_epi8(fl, four);                                        \
+  const __m128i k4 = _mm_set1_epi8(4);
-                                                                               \
+  __m128i v3 = _mm_adds_epi8(*fl, k3);
-  /* Do +4 side */                                                             \
+  __m128i v4 = _mm_adds_epi8(*fl, k4);
-  SIGNED_SHIFT_N(v4, 3);                /* v4 >> 3  */                         \
+
-  q0 = _mm_subs_epi8(q0, v4);           /* q0 -= v4 */                         \
+  SignedShift8b(&v4);                  // v4 >> 3
-                                                                               \
+  SignedShift8b(&v3);                  // v3 >> 3
-  /* Now do +3 side */                                                         \
+  *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
-  SIGNED_SHIFT_N(v3, 3);                /* v3 >> 3  */                         \
+  *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
  p0 = _mm_adds_epi8(p0, v3);           /* p0 += v3 */                         \
 }
 // Updates values of 2 pixels at MB edge during complex filtering.
 // Update operations:
 // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
-#define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) {                                   \
+// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
-  const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7);                               \
+static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
-  const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7);                               \
+                                      const __m128i* const a0_lo,
-  const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7);                         \
+                                      const __m128i* const a0_hi) {
-  pi = _mm_adds_epi8(pi, delta);                                               \
+  const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
-  qi = _mm_subs_epi8(qi, delta);                                               \
+  const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
  const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  *pi = _mm_adds_epi8(*pi, delta);
  *qi = _mm_subs_epi8(*qi, delta);
  FLIP_SIGN_BIT2(*pi, *qi);
 }
-static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
+// input pixels are uint8_t
-                        const __m128i* q1, int thresh, __m128i *mask) {
+static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
-  __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
+                                    const __m128i* const p0,
-  *mask = _mm_set1_epi8(0xFE);
+                                    const __m128i* const q0,
-  t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
+                                    const __m128i* const q1,
-  t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2
+                                    int thresh, __m128i* const mask) {
  const __m128i m_thresh = _mm_set1_epi8(thresh);
  const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
  const __m128i kFE = _mm_set1_epi8(0xFE);
  const __m128i t2 = _mm_and_si128(t1, kFE);  // set lsb of each byte to zero
  const __m128i t3 = _mm_srli_epi16(t2, 1);   // abs(p1 - q1) / 2
-  *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
+  const __m128i t4 = MM_ABS(*p0, *q0);        // abs(p0 - q0)
-  *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
+  const __m128i t5 = _mm_adds_epu8(t4, t4);   // abs(p0 - q0) * 2
-  *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+  const __m128i t6 = _mm_adds_epu8(t5, t3);   // abs(p0-q0)*2 + abs(p1-q1)/2
-  t1 = _mm_set1_epi8(thresh);
+  const __m128i t7 = _mm_subs_epu8(t6, m_thresh);  // mask <= m_thresh
-  *mask = _mm_subs_epu8(*mask, t1);     // mask <= thresh
+  *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
  *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
 }
 //------------------------------------------------------------------------------
 // Edge filtering functions
 // Applies filter on 2 pixels (p0 and q0)
-static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
+static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
-                                  const __m128i* q1, int thresh) {
+                                  __m128i* const q0, __m128i* const q1,
                                  int thresh) {
  __m128i a, mask;
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  // convert p1/q1 to int8_t (for GetBaseDelta)
  const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
  const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
  NeedsFilter(p1, p0, q0, q1, thresh, &mask);
  // convert to signed values
  FLIP_SIGN_BIT2(*p0, *q0);
-
+  GetBaseDelta(&p1s, p0, q0, &q1s, &a);
  GET_BASE_DELTA(p1s, *p0, *q0, q1s, a);
  a = _mm_and_si128(a, mask);     // mask filter values we don't care about
-  DO_SIMPLE_FILTER(*p0, *q0, a);
+  DoSimpleFilter(p0, q0, &a);
  // unoffset
  FLIP_SIGN_BIT2(*p0, *q0);
 }
 // Applies filter on 4 pixels (p1, p0, q0 and q1)
-static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
+static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
-                                  __m128i* q0, __m128i* q1,
+                                  __m128i* const q0, __m128i* const q1,
-                                  const __m128i* mask, int hev_thresh) {
+                                  const __m128i* const mask, int hev_thresh) {
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i k64 = _mm_set1_epi8(0x40);
  const __m128i zero = _mm_setzero_si128();
  __m128i not_hev;
  __m128i t1, t2, t3;
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  // compute hev mask
-  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@ -385,92 +448,83 @@ static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about
  // Do +4 side
  t2 = _mm_set1_epi8(4);
  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 4
  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  t3 = t2;                           // save t2
  *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= t2
  // Now do +3 side
  t2 = _mm_set1_epi8(3);
-  t2 = _mm_adds_epi8(t1, t2);        // +3 instead of +4
+  t3 = _mm_set1_epi8(4);
-  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 3
  t3 = _mm_adds_epi8(t1, t3);        // 3 * (q0 - p0) + (p1 - q1) + 4
  SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
  SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
  *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
  FLIP_SIGN_BIT2(*p0, *q0);
-  t2 = _mm_set1_epi8(1);
+  // this is equivalent to signed (a + 1) >> 1 calculation
-  t3 = _mm_adds_epi8(t3, t2);
+  t2 = _mm_add_epi8(t3, sign_bit);
-  SIGNED_SHIFT_N(t3, 1);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 4
+  t3 = _mm_avg_epu8(t2, zero);
  t3 = _mm_sub_epi8(t3, k64);
  t3 = _mm_and_si128(not_hev, t3);   // if !hev
  *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
  *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
-
+  FLIP_SIGN_BIT2(*p1, *q1);
  // unoffset
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
 }
 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
+static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
-                                  __m128i* q0, __m128i* q1, __m128i *q2,
+                                  __m128i* const p0, __m128i* const q0,
-                                  const __m128i* mask, int hev_thresh) {
+                                  __m128i* const q1, __m128i* const q2,
-  __m128i a, not_hev;
+                                  const __m128i* const mask, int hev_thresh) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  __m128i a, not_hev;
  // compute hev mask
-  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
-
+  GetBaseDelta(p1, p0, q0, q1, &a);
  GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);
  { // do simple filter on pixels with hev
    const __m128i m = _mm_andnot_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
-    DO_SIMPLE_FILTER(*p0, *q0, f);
+    DoSimpleFilter(p0, q0, &f);
  }
  { // do strong filter on pixels with not hev
-    const __m128i zero = _mm_setzero_si128();
+    const __m128i k9 = _mm_set1_epi16(0x0900);
-    const __m128i nine = _mm_set1_epi16(0x0900);
+    const __m128i k63 = _mm_set1_epi16(63);
    const __m128i sixty_three = _mm_set1_epi16(63);
    const __m128i m = _mm_and_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
    const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
    const __m128i f_hi = _mm_unpackhi_epi8(zero, f);
-    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine);   // Filter (lo) * 9
+    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9);    // Filter (lo) * 9
-    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine);   // Filter (hi) * 9
+    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9);    // Filter (hi) * 9
    const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo);  // Filter (lo) * 18
    const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi);  // Filter (hi) * 18
-    const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three);  // Filter * 9 + 63
+    const __m128i a2_lo = _mm_add_epi16(f9_lo, k63);    // Filter * 9 + 63
-    const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three);  // Filter * 9 + 63
+    const __m128i a2_hi = _mm_add_epi16(f9_hi, k63);    // Filter * 9 + 63
-    const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three);  // F... * 18 + 63
+    const __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo);  // Filter * 18 + 63
-    const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three);  // F... * 18 + 63
+    const __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi);  // Filter * 18 + 63
-    const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo);  // Filter * 27 + 63
+    const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo);  // Filter * 27 + 63
-    const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi);  // Filter * 27 + 63
+    const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi);  // Filter * 27 + 63
-    UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
+    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
-    UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
+    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
-    UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
+    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
  }
  // unoffset
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
 }
 // reads 8 rows across a vertical edge.
 //
 // TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
 // two Load4x4() to avoid code duplication.
-static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
+static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
-                                __m128i* p, __m128i* q) {
+                                __m128i* const p, __m128i* const q) {
  __m128i t1, t2;
  // Load 0th, 1st, 4th and 5th rows
@ -509,10 +563,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
  *q = _mm_unpackhi_epi32(t1, t2);
 }
-static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
+static WEBP_INLINE void Load16x4(const uint8_t* const r0,
                                 const uint8_t* const r8,
                                 int stride,
-                                 __m128i* p1, __m128i* p0,
+                                 __m128i* const p1, __m128i* const p0,
-                                 __m128i* q0, __m128i* q1) {
+                                 __m128i* const q0, __m128i* const q1) {
  __m128i t1, t2;
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
@ -544,7 +599,7 @@ static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
  *q1 = _mm_unpackhi_epi64(t2, *q1);
 }
-static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
@ -553,48 +608,51 @@ static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
 }
 // Transpose back and store
-static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride,
+static WEBP_INLINE void Store16x4(const __m128i* const p1,
-                                  __m128i* p1, __m128i* p0,
+                                  const __m128i* const p0,
-                                  __m128i* q0, __m128i* q1) {
+                                  const __m128i* const q0,
-  __m128i t1;
+                                  const __m128i* const q1,
                                  uint8_t* r0, uint8_t* r8,
                                  int stride) {
  __m128i t1, p1_s, p0_s, q0_s, q1_s;
  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
  t1 = *p0;
-  *p0 = _mm_unpacklo_epi8(*p1, t1);
+  p0_s = _mm_unpacklo_epi8(*p1, t1);
-  *p1 = _mm_unpackhi_epi8(*p1, t1);
+  p1_s = _mm_unpackhi_epi8(*p1, t1);
  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
  t1 = *q0;
-  *q0 = _mm_unpacklo_epi8(t1, *q1);
+  q0_s = _mm_unpacklo_epi8(t1, *q1);
-  *q1 = _mm_unpackhi_epi8(t1, *q1);
+  q1_s = _mm_unpackhi_epi8(t1, *q1);
  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-  t1 = *p0;
+  t1 = p0_s;
-  *p0 = _mm_unpacklo_epi16(t1, *q0);
+  p0_s = _mm_unpacklo_epi16(t1, q0_s);
-  *q0 = _mm_unpackhi_epi16(t1, *q0);
+  q0_s = _mm_unpackhi_epi16(t1, q0_s);
  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-  t1 = *p1;
+  t1 = p1_s;
-  *p1 = _mm_unpacklo_epi16(t1, *q1);
+  p1_s = _mm_unpacklo_epi16(t1, q1_s);
-  *q1 = _mm_unpackhi_epi16(t1, *q1);
+  q1_s = _mm_unpackhi_epi16(t1, q1_s);
-  Store4x4(p0, r0, stride);
+  Store4x4(&p0_s, r0, stride);
  r0 += 4 * stride;
-  Store4x4(q0, r0, stride);
+  Store4x4(&q0_s, r0, stride);
-  Store4x4(p1, r8, stride);
+  Store4x4(&p1_s, r8, stride);
  r8 += 4 * stride;
-  Store4x4(q1, r8, stride);
+  Store4x4(&q1_s, r8, stride);
 }
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
-static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  // Load
  __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
  __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
@ -605,49 +663,49 @@ static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
  // Store
  _mm_storeu_si128((__m128i*)&p[-stride], p0);
-  _mm_storeu_si128((__m128i*)p, q0);
+  _mm_storeu_si128((__m128i*)&p[0], q0);
 }
-static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  __m128i p1, p0, q0, q1;
  p -= 2;  // beginning of p1
  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
  DoFilter2(&p1, &p0, &q0, &q1, thresh);
-  Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+  Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
 }
-static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
-    SimpleVFilter16SSE2(p, stride, thresh);
+    SimpleVFilter16(p, stride, thresh);
  }
 }
-static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
-    SimpleHFilter16SSE2(p, stride, thresh);
+    SimpleHFilter16(p, stride, thresh);
  }
 }
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
-#define MAX_DIFF1(p3, p2, p1, p0, m) {                                         \
+#define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
-  m = MM_ABS(p3, p2);                                                          \
+  m = MM_ABS(p1, p0);                                                          \
  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
 }
 #define MAX_DIFF2(p3, p2, p1, p0, m) {                                         \
  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
 } while (0)
 #define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
-}
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
 } while (0)
 #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
@ -656,10 +714,11 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
 }
-#define LOADUV_H_EDGE(p, u, v, stride) {                                       \
+#define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
-  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                               \
+  const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
-  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)]));        \
+  const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);                 \
-}
+  p = _mm_unpacklo_epi64(U, V);                                                \
 } while (0)
 #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
  LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
@ -674,17 +733,22 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
  _mm_storel_epi64((__m128i*)&v[(stride)], p);                                 \
 }
-#define COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask) {               \
+static WEBP_INLINE void ComplexMask(const __m128i* const p1,
-  __m128i fl_yes;                                                              \
+                                    const __m128i* const p0,
-  const __m128i it = _mm_set1_epi8(ithresh);                                   \
+                                    const __m128i* const q0,
-  mask = _mm_subs_epu8(mask, it);                                              \
+                                    const __m128i* const q1,
-  mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128());                            \
+                                    int thresh, int ithresh,
-  NeedsFilter(&p1, &p0, &q0, &q1, thresh, &fl_yes);                            \
+                                    __m128i* const mask) {
-  mask = _mm_and_si128(mask, fl_yes);                                          \
+  const __m128i it = _mm_set1_epi8(ithresh);
  const __m128i diff = _mm_subs_epu8(*mask, it);
  const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
  __m128i filter_mask;
  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
  *mask = _mm_and_si128(thresh_mask, filter_mask);
 }
 // on macroblock edges
-static void VFilter16SSE2(uint8_t* p, int stride,
+static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  __m128i t1;
  __m128i mask;
@ -698,19 +762,19 @@ static void VFilter16SSE2(uint8_t* p, int stride,
  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);
-  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
  // Store
  _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
  _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
  _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
-  _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
+  _mm_storeu_si128((__m128i*)&p[+0 * stride], q0);
-  _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+  _mm_storeu_si128((__m128i*)&p[+1 * stride], q1);
-  _mm_storeu_si128((__m128i*)&p[2 * stride], q2);
+  _mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
 }
-static void HFilter16SSE2(uint8_t* p, int stride,
+static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
@ -722,70 +786,77 @@ static void HFilter16SSE2(uint8_t* p, int stride,
  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);
-  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
-  Store16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
+  Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
-  Store16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
+  Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
 }
 // on three inner edges
-static void VFilter16iSSE2(uint8_t* p, int stride,
+static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
-  __m128i mask;
+  __m128i p3, p2, p1, p0;   // loop invariants
-  __m128i t1, t2, p1, p0, q0, q1;
+
  LOAD_H_EDGES4(p, stride, p3, p2, p1, p0);  // prologue
  for (k = 3; k > 0; --k) {
-    // Load p3, p2, p1, p0
+    __m128i mask, tmp1, tmp2;
-    LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
+    uint8_t* const b = p + 2 * stride;   // beginning of p1
    MAX_DIFF1(t2, t1, p1, p0, mask);
    p += 4 * stride;
-    // Load q0, q1, q2, q3
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
-    LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
+    LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2);
-    MAX_DIFF2(t2, t1, q1, q0, mask);
+    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
-    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+    // p3 and p2 are not just temporary variables here: they will be
-    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+    // re-used for next span. And q2/q3 will become p1/p0 accordingly.
    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
    // Store
-    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
+    _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
-    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
+    _mm_storeu_si128((__m128i*)&b[1 * stride], p0);
-    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
+    _mm_storeu_si128((__m128i*)&b[2 * stride], p3);
-    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+    _mm_storeu_si128((__m128i*)&b[3 * stride], p2);
    // rotate samples
    p1 = tmp1;
    p0 = tmp2;
  }
 }
-static void HFilter16iSSE2(uint8_t* p, int stride,
+static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
-  uint8_t* b;
+  __m128i p3, p2, p1, p0;   // loop invariants
-  __m128i mask;
+
-  __m128i t1, t2, p1, p0, q0, q1;
+  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
  for (k = 3; k > 0; --k) {
-    b = p;
+    __m128i mask, tmp1, tmp2;
-    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
+    uint8_t* const b = p + 2;   // beginning of p1
    MAX_DIFF1(t2, t1, p1, p0, mask);
-    b += 4;  // beginning of q0
+    p += 4;  // beginning of q0 (and next span)
    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
    MAX_DIFF2(t2, t1, q1, q0, mask);
-    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
-    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
-    b -= 2;  // beginning of p1
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
-    Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
-    p += 4;
+    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
    // rotate samples
    p1 = tmp1;
    p0 = tmp2;
  }
 }
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, p2, p1, p0, q0, q1, q2;
@ -798,7 +869,7 @@ static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);
-  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
  // Store
@ -810,7 +881,7 @@ static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q2, u, v, 2 * stride);
 }
-static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
@ -823,14 +894,14 @@ static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);
-  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
-  Store16x4(tu, tv, stride, &p3, &p2, &p1, &p0);
+  Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
-  Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
+  Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
 }
-static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
@ -846,7 +917,7 @@ static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
  LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
  MAX_DIFF2(t2, t1, q1, q0, mask);
-  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
  // Store
@ -856,7 +927,7 @@ static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q1, u, v, 1 * stride);
 }
-static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
@ -868,12 +939,12 @@ static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
  MAX_DIFF2(t2, t1, q1, q0, mask);
-  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
  u -= 2;  // beginning of p1
  v -= 2;
-  Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
+  Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
 }
 #endif   // WEBP_USE_SSE2
@ -885,24 +956,23 @@ extern void VP8DspInitSSE2(void);
 // Assigns SSE2 implementations to the VP8 decoder function pointers
 // (transform, complex loop filters, simple loop filters).
 // When WEBP_USE_SSE2 is not defined the body is compiled out and the
 // function does nothing.
 void VP8DspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
-  VP8Transform = TransformSSE2;
+  VP8Transform = Transform;
 // The AC3 transform hook is only set when USE_TRANSFORM_AC3 is defined.
 #if defined(USE_TRANSFORM_AC3)
  VP8TransformAC3 = TransformAC3;
 #endif
-  VP8VFilter16 = VFilter16SSE2;
+  VP8VFilter16 = VFilter16;
-  VP8HFilter16 = HFilter16SSE2;
+  VP8HFilter16 = HFilter16;
-  VP8VFilter8 = VFilter8SSE2;
+  VP8VFilter8 = VFilter8;
-  VP8HFilter8 = HFilter8SSE2;
+  VP8HFilter8 = HFilter8;
-  VP8VFilter16i = VFilter16iSSE2;
+  VP8VFilter16i = VFilter16i;
-  VP8HFilter16i = HFilter16iSSE2;
+  VP8HFilter16i = HFilter16i;
-  VP8VFilter8i = VFilter8iSSE2;
+  VP8VFilter8i = VFilter8i;
-  VP8HFilter8i = HFilter8iSSE2;
+  VP8HFilter8i = HFilter8i;
-  VP8SimpleVFilter16 = SimpleVFilter16SSE2;
+  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
+  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
 #endif   // WEBP_USE_SSE2
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   Speed-critical functions.
@ -12,35 +14,76 @@
 #ifndef WEBP_DSP_DSP_H_
 #define WEBP_DSP_DSP_H_
 #ifdef HAVE_CONFIG_H
 #include "../webp/config.h"
 #endif
 #include "../webp/types.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // CPU detection
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#if defined(__GNUC__)
 # define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
 # define LOCAL_GCC_PREREQ(maj, min) \
    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
 #else
 # define LOCAL_GCC_VERSION 0
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif
 #ifdef __clang__
 # define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
 # define LOCAL_CLANG_PREREQ(maj, min) \
    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
 #else
 # define LOCAL_CLANG_VERSION 0
 # define LOCAL_CLANG_PREREQ(maj, min) 0
 #endif  // __clang__
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
 #endif
-#if defined(__SSE2__) || defined(WEBP_MSC_SSE2)
+// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
 // files without intrinsics, allowing the corresponding Init() to be called.
 // Files containing intrinsics will need to be built targeting the instruction
 // set so should succeed on one of the earlier tests.
 #if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
 #define WEBP_USE_SSE2
 #endif
 #if defined(__AVX2__) || defined(WEBP_HAVE_AVX2)
 #define WEBP_USE_AVX2
 #endif
 #if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
 #define WEBP_ANDROID_NEON  // Android targets that might support NEON
 #endif
-#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON)
+#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || defined(__aarch64__)
 #define WEBP_USE_NEON
 #endif
 #if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6)
 #define WEBP_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WEBP_USE_MIPS32_R2
 #endif
 #endif
 // Identifiers for the CPU capabilities that can be passed to a VP8CPUInfo
 // callback to ask whether the running CPU supports them.
 typedef enum {
  kSSE2,
  kSSE3,
-  kNEON
+  kAVX,
  kAVX2,
  kNEON,
  kMIPS32
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
@ -58,7 +101,6 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
 typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
 extern VP8Idct VP8ITransform;
 extern VP8Fdct VP8FTransform;
 extern VP8WHT VP8ITransformWHT;
 extern VP8WHT VP8FTransformWHT;
 // Predictions
 // *dst is the destination block. *top and *left can be NULL.
@ -80,9 +122,14 @@ extern VP8BlockCopy VP8Copy4x4;
 // Quantization
 struct VP8Matrix;   // forward declaration
 typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
-                                int n, const struct VP8Matrix* const mtx);
+                                const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 // specific to 2nd transform:
 typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
                                   const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 // Collect histogram for susceptibility calculation and accumulate in histo[].
 struct VP8Histogram;
 typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
@ -100,6 +147,7 @@ typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
 typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
 extern VP8DecIdct2 VP8Transform;
 extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
 extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;
@ -112,6 +160,13 @@ extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
 extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
 extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
 // clipping tables (for filtering)
 extern const int8_t* const VP8ksclip1;  // clips [-1020, 1020] to [-128, 127]
 extern const int8_t* const VP8ksclip2;  // clips [-112, 112] to [-16, 15]
 extern const uint8_t* const VP8kclip1;  // clips [-255,511] to [0,255]
 extern const uint8_t* const VP8kabs0;   // abs(x) for x in [-255,255]
 void VP8InitClipTables(void);           // must be called first
 // simple filter (only for luma)
 typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
 extern VP8SimpleFilterFunc VP8SimpleVFilter16;
@ -144,6 +199,8 @@ void VP8DspInit(void);
 #define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
 // Convert a pair of y/u/v lines together to the output rgb/a colorspace.
 // bottom_y can be NULL if only one line of output is needed (at top/bottom).
 typedef void (*WebPUpsampleLinePairFunc)(
    const uint8_t* top_y, const uint8_t* bottom_y,
    const uint8_t* top_u, const uint8_t* top_v,
@ -155,21 +212,20 @@ typedef void (*WebPUpsampleLinePairFunc)(
 // Fancy upsampling functions to convert YUV to RGB(A) modes
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 // Initializes SSE2 version of the fancy upsamplers.
 void WebPInitUpsamplersSSE2(void);
 // NEON version
 void WebPInitUpsamplersNEON(void);
 #endif    // FANCY_UPSAMPLING
-// Point-sampling methods.
+// Per-row point-sampling methods.
-typedef void (*WebPSampleLinePairFunc)(
+typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
    const uint8_t* top_y, const uint8_t* bottom_y,
                                   const uint8_t* u, const uint8_t* v,
-    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+                                   uint8_t* dst, int len);
 // Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
 void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
                             const uint8_t* u, const uint8_t* v, int uv_stride,
                             uint8_t* dst, int dst_stride,
                             int width, int height, WebPSamplerRowFunc func);
-extern const WebPSampleLinePairFunc WebPSamplers[/* MODE_LAST */];
+// Sampling functions to convert rows of YUV to RGB(A)
 extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
 // General function for converting two lines of ARGB or RGBA.
 // 'alpha_is_last' should be true if 0xff000000 is stored in memory as
@ -183,11 +239,14 @@ typedef void (*WebPYUV444Converter)(const uint8_t* y,
 extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
-// Main function to be called
+// Must be called before using the WebPUpsamplers[] (and for premultiplied
 // colorspaces like rgbA, rgbA4444, etc)
 void WebPInitUpsamplers(void);
 // Must be called before using WebPSamplers[]
 void WebPInitSamplers(void);
 //------------------------------------------------------------------------------
-// Pre-multiply planes with alpha values
+// Utilities for processing transparent channel.
 // Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
 // alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
@ -198,15 +257,36 @@ extern void (*WebPApplyAlphaMultiply)(
 extern void (*WebPApplyAlphaMultiply4444)(
    uint8_t* rgba4444, int w, int h, int stride);
 // Extract the alpha values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlpha).
 // Returns true if there's only trivial 0xff alpha values.
 extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
                               int width, int height,
                               uint8_t* alpha, int alpha_stride);
 // Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
 // Un-Multiply operation transforms x into x * 255 / A.
 // Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
 extern void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
 // Same as WebPMultARGBRow(), but for several rows.
 void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
                      int inverse);
 // Same for a row of single values, with side alpha values.
 extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
                           int width, int inverse);
 // Same as WebPMultRow(), but for several 'num_rows' rows.
 void WebPMultRows(uint8_t* ptr, int stride,
                  const uint8_t* alpha, int alpha_stride,
                  int width, int num_rows, int inverse);
 // To be called first before using the above.
-void WebPInitPremultiply(void);
+void WebPInitAlphaProcessing(void);
-void WebPInitPremultiplySSE2(void);   // should not be called directly.
+#ifdef __cplusplus
 void WebPInitPremultiplyNEON(void);
 //------------------------------------------------------------------------------
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -1,22 +1,22 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Speed-critical encoding functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <assert.h>
 #include <stdlib.h>  // for abs()
 #include "./dsp.h"
 #include "../enc/vp8enci.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 // Clamps 'v' to the range [0, 255].
 // If no bit outside the low 8 is set, 'v' is already in range and is returned
 // as is; otherwise negative values map to 0 and all others to 255.
 static WEBP_INLINE uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }
@ -142,9 +142,9 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
    const int a1 = (d1 + d2);
    const int a2 = (d1 - d2);
    const int a3 = (d0 - d3);
-    tmp[0 + i * 4] = (a0 + a1) << 3;  // 14b                      [-8160,8160]
+    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
-    tmp[2 + i * 4] = (a0 - a1) << 3;
+    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  for (i = 0; i < 4; ++i) {
@ -159,59 +159,33 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  }
 }
 // Two-pass 4x4 transform of the 16 contiguous values in 'in'.
 // The 16 results are not stored contiguously: each one is written 16 int16_t
 // apart in 'out' (out[0], out[16], out[32], out[48], then 'out' advances by
 // 64), so the highest index written is 3 * 64 + 48.
 static void ITransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  // First pass: sums/differences of in[i], in[4 + i], in[8 + i], in[12 + i].
  for (i = 0; i < 4; ++i) {
    const int a0 = in[0 + i] + in[12 + i];
    const int a1 = in[4 + i] + in[ 8 + i];
    const int a2 = in[4 + i] - in[ 8 + i];
    const int a3 = in[0 + i] - in[12 + i];
    tmp[0  + i] = a0 + a1;
    tmp[8  + i] = a0 - a1;
    tmp[4  + i] = a3 + a2;
    tmp[12 + i] = a3 - a2;
  }
  // Second pass: same butterfly on four consecutive tmp[] entries. The +3
  // added to 'dc' reaches all four outputs (through a0 and a3) and serves as
  // the bias applied before the final >> 3.
  for (i = 0; i < 4; ++i) {
    const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
    const int a0 = dc             + tmp[3 + i * 4];
    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
    const int a3 = dc             - tmp[3 + i * 4];
    out[ 0] = (a0 + a1) >> 3;
    out[16] = (a3 + a2) >> 3;
    out[32] = (a0 - a1) >> 3;
    out[48] = (a3 - a2) >> 3;
    // Move on to the next group of four outputs.
    out += 64;
  }
 }
 // Two-pass 4x4 transform whose input is gathered with a stride: each of the
 // four first-pass iterations reads in[0], in[16], in[32] and in[48], then
 // advances 'in' by 64. The 16 results are written contiguously to out[0..15].
 static void FTransformWHT(const int16_t* in, int16_t* out) {
-  int tmp[16];
+  // input is 12b signed
  int32_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
-    const int a0 = (in[0 * 16] + in[2 * 16]) << 2;
+    const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
-    const int a1 = (in[1 * 16] + in[3 * 16]) << 2;
+    const int a1 = (in[1 * 16] + in[3 * 16]);
-    const int a2 = (in[1 * 16] - in[3 * 16]) << 2;
+    const int a2 = (in[1 * 16] - in[3 * 16]);
-    const int a3 = (in[0 * 16] - in[2 * 16]) << 2;
+    const int a3 = (in[0 * 16] - in[2 * 16]);
-    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
+    tmp[0 + i * 4] = a0 + a1;   // 14b
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // Second pass: combines tmp[i], tmp[4 + i], tmp[8 + i], tmp[12 + i].
  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
-    const int b0 = a0 + a1;
+    const int b0 = a0 + a1;    // 16b
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
-    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
+    out[ 0 + i] = b0 >> 1;     // 15b
-    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
+    out[ 4 + i] = b1 >> 1;
-    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
+    out[ 8 + i] = b2 >> 1;
-    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
+    out[12 + i] = b3 >> 1;
  }
 }
@ -626,21 +600,49 @@ static const uint8_t kZigzag[16] = {
 // Simple quantization
 // Quantizes coefficients of in[], visiting them in kZigzag order.
 // For position n (with j = kZigzag[n]): if |in[j]| + mtx->sharpen_[j] exceeds
 // mtx->zthresh_[j], the level is computed with QUANTDIV, capped at MAX_LEVEL,
 // given the sign of in[j] and stored in out[n], and in[j] is overwritten with
 // level * mtx->q_[j]. Otherwise out[n] and in[j] are both set to 0.
 // Returns 1 if at least one non-zero level was produced, 0 otherwise.
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         int n, const VP8Matrix* const mtx) {
+                         const VP8Matrix* const mtx) {
  int last = -1;
-  for (; n < 16; ++n) {
+  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
-    const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (coeff > mtx->zthresh_[j]) {
-      const int Q = mtx->q_[j];
+      const uint32_t Q = mtx->q_[j];
-      const int iQ = mtx->iq_[j];
+      const uint32_t iQ = mtx->iq_[j];
-      const int B = mtx->bias_[j];
+      const uint32_t B = mtx->bias_[j];
-      out[n] = QUANTDIV(coeff, iQ, B);
+      int level = QUANTDIV(coeff, iQ, B);
-      if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
+      if (level > MAX_LEVEL) level = MAX_LEVEL;
-      if (sign) out[n] = -out[n];
+      if (sign) level = -level;
-      in[j] = out[n] * Q;
+      in[j] = level * Q;
-      if (out[n]) last = n;
+      out[n] = level;
      if (level) last = n;
    } else {
      out[n] = 0;
      in[j] = 0;
    }
  }
  // 'last' stays at -1 unless some position received a non-zero level.
  return (last >= 0);
 }
 static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
                            const VP8Matrix* const mtx) {
  int n, last = -1;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
    const uint32_t coeff = sign ? -in[j] : in[j];
    assert(mtx->sharpen_[j] == 0);
    if (coeff > mtx->zthresh_[j]) {
      const uint32_t Q = mtx->q_[j];
      const uint32_t iQ = mtx->iq_[j];
      const uint32_t B = mtx->bias_[j];
      int level = QUANTDIV(coeff, iQ, B);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * Q;
      out[n] = level;
      if (level) last = n;
    } else {
      out[n] = 0;
      in[j] = 0;
@ -671,7 +673,6 @@ static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
 VP8CHisto VP8CollectHistogram;
 VP8Idct VP8ITransform;
 VP8Fdct VP8FTransform;
 VP8WHT VP8ITransformWHT;
 VP8WHT VP8FTransformWHT;
 VP8Intra4Preds VP8EncPredLuma4;
 VP8IntraPreds VP8EncPredLuma16;
@ -683,19 +684,27 @@ VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;
 extern void VP8EncDspInitSSE2(void);
 extern void VP8EncDspInitAVX2(void);
 extern void VP8EncDspInitNEON(void);
 extern void VP8EncDspInitMIPS32(void);
 static volatile VP8CPUInfo enc_last_cpuinfo_used =
    (VP8CPUInfo)&enc_last_cpuinfo_used;
 void VP8EncDspInit(void) {
  if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
  VP8DspInit();  // common inverse transforms
  InitTables();
  // default C implementations
  VP8CollectHistogram = CollectHistogram;
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
  VP8ITransformWHT = ITransformWHT;
  VP8FTransformWHT = FTransformWHT;
  VP8EncPredLuma4 = Intra4Preds;
  VP8EncPredLuma16 = Intra16Preds;
@ -707,22 +716,32 @@ void VP8EncDspInit(void) {
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8EncQuantizeBlock = QuantizeBlock;
  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
  VP8Copy4x4 = Copy4x4;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8GetCPUInfo) {
+  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspInitSSE2();
    }
-#elif defined(WEBP_USE_NEON)
+#endif
 #if defined(WEBP_USE_AVX2)
    if (VP8GetCPUInfo(kAVX2)) {
      VP8EncDspInitAVX2();
    }
 #endif
 #if defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      VP8EncDspInitNEON();
    }
 #endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
    }
 #endif
  }
  enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dsp/enc_avx2.c
+++ b/src/dsp/enc_avx2.c
@ -0,0 +1,24 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // AVX2 version of speed-critical encoding functions.
 #include "./dsp.h"
 #if defined(WEBP_USE_AVX2)
 #endif  // WEBP_USE_AVX2
 //------------------------------------------------------------------------------
 // Entry point
 extern void VP8EncDspInitAVX2(void);
 // Empty body: no encoder function pointers are overridden here.
 void VP8EncDspInitAVX2(void) {
 }
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@ -0,0 +1,776 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // MIPS version of speed-critical encoding functions.
 //
 // Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
 //            Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 //            Slobodan Prijic  (slobodan.prijic@imgtec.com)
 #include "./dsp.h"
 #if defined(WEBP_USE_MIPS32)
 #include "../enc/vp8enci.h"
 #include "../enc/cost.h"
 // Defines WORK_AROUND_GCC when compiling with GCC for Android and
 // LOCAL_GCC_VERSION is equal to 0x409.
 // NOTE(review): presumably 0x409 stands for GCC 4.9 and LOCAL_GCC_VERSION is
 // provided by dsp.h; the code that tests WORK_AROUND_GCC is not visible in
 // this part of the file -- confirm before relying on it.
 #if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
 #define WORK_AROUND_GCC
 #endif
 // Multipliers used by the inverse transform passes below. Every product is
 // shifted right by 16 bits right after the multiply, so kC1 (20091 + 65536)
 // and kC2 act as fixed-point factors with 16 fractional bits.
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 // macro for one vertical pass in ITransformOne
 // MUL macro inlined (multiply by kC1 or kC2, then arithmetic shift right by 16)
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to load from in buffer (its address is in temp20)
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 // TEMP4 - temporary register; it is first written after the four loads, so
 //         the caller may pass temp20 here on the last pass
 // temp16..temp19 - scratch registers
 // Every instruction line ends with "\n\t" so that the generated assembly is
 // formatted uniformly.
 #define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
  "lh      %[temp16],      "#A"(%[temp20])                 \n\t"            \
  "lh      %[temp18],      "#B"(%[temp20])                 \n\t"            \
  "lh      %[temp17],      "#C"(%[temp20])                 \n\t"            \
  "lh      %[temp19],      "#D"(%[temp20])                 \n\t"            \
  "addu    %["#TEMP4"],    %[temp16],      %[temp18]       \n\t"            \
  "subu    %[temp16],      %[temp16],      %[temp18]       \n\t"            \
  "mul     %["#TEMP0"],    %[temp17],      %[kC2]          \n\t"            \
  "mul     %[temp18],      %[temp19],      %[kC1]          \n\t"            \
  "mul     %[temp17],      %[temp17],      %[kC1]          \n\t"            \
  "mul     %[temp19],      %[temp19],      %[kC2]          \n\t"            \
  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
  "sra     %[temp18],      %[temp18],      16              \n\t"            \
  "sra     %[temp17],      %[temp17],      16              \n\t"            \
  "sra     %[temp19],      %[temp19],      16              \n\t"            \
  "subu    %["#TEMP2"],    %["#TEMP0"],    %[temp18]       \n\t"            \
  "addu    %["#TEMP3"],    %[temp17],      %[temp19]       \n\t"            \
  "addu    %["#TEMP0"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"            \
  "addu    %["#TEMP1"],    %[temp16],      %["#TEMP2"]     \n\t"            \
  "subu    %["#TEMP2"],    %[temp16],      %["#TEMP2"]     \n\t"            \
  "subu    %["#TEMP3"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"
 // macro for one horizontal pass in ITransformOne
 // MUL and STORE macros inlined
 // a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to load from ref and store to dst buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
 // Sequence of one pass:
 //   - 4 is added to TEMP0 before the butterflies and the four results are
 //     shifted right by 3,
 //   - the pointer stored at 0(%[args]) is loaded into temp20 and four bytes
 //     are read through it and added to the results,
 //   - the sums are clamped to [0, 255] with slt/movn (lower bound) and
 //     slt/movz (upper bound),
 //   - the pointer stored at 8(%[args]) is loaded into temp16 and the four
 //     bytes are stored through it.
 // temp16..temp20 are used as scratch registers.
 #define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)            \
  "addiu   %["#TEMP0"],    %["#TEMP0"],    4               \n\t"            \
  "addu    %[temp16],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
  "subu    %[temp17],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
  "mul     %["#TEMP0"],    %["#TEMP4"],    %[kC2]          \n\t"            \
  "mul     %["#TEMP8"],    %["#TEMP12"],   %[kC1]          \n\t"            \
  "mul     %["#TEMP4"],    %["#TEMP4"],    %[kC1]          \n\t"            \
  "mul     %["#TEMP12"],   %["#TEMP12"],   %[kC2]          \n\t"            \
  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
  "sra     %["#TEMP8"],    %["#TEMP8"],    16              \n\t"            \
  "sra     %["#TEMP4"],    %["#TEMP4"],    16              \n\t"            \
  "sra     %["#TEMP12"],   %["#TEMP12"],   16              \n\t"            \
  "subu    %[temp18],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
  "addu    %[temp19],      %["#TEMP4"],    %["#TEMP12"]    \n\t"            \
  "addu    %["#TEMP0"],    %[temp16],      %[temp19]       \n\t"            \
  "addu    %["#TEMP4"],    %[temp17],      %[temp18]       \n\t"            \
  "subu    %["#TEMP8"],    %[temp17],      %[temp18]       \n\t"            \
  "subu    %["#TEMP12"],   %[temp16],      %[temp19]       \n\t"            \
  "lw      %[temp20],      0(%[args])                      \n\t"            \
  "sra     %["#TEMP0"],    %["#TEMP0"],    3               \n\t"            \
  "sra     %["#TEMP4"],    %["#TEMP4"],    3               \n\t"            \
  "sra     %["#TEMP8"],    %["#TEMP8"],    3               \n\t"            \
  "sra     %["#TEMP12"],   %["#TEMP12"],   3               \n\t"            \
  "lbu     %[temp16],      "#A"(%[temp20])                 \n\t"            \
  "lbu     %[temp17],      "#B"(%[temp20])                 \n\t"            \
  "lbu     %[temp18],      "#C"(%[temp20])                 \n\t"            \
  "lbu     %[temp19],      "#D"(%[temp20])                 \n\t"            \
  "addu    %["#TEMP0"],    %[temp16],      %["#TEMP0"]     \n\t"            \
  "addu    %["#TEMP4"],    %[temp17],      %["#TEMP4"]     \n\t"            \
  "addu    %["#TEMP8"],    %[temp18],      %["#TEMP8"]     \n\t"            \
  "addu    %["#TEMP12"],   %[temp19],      %["#TEMP12"]    \n\t"            \
  "slt     %[temp16],      %["#TEMP0"],    $zero           \n\t"            \
  "slt     %[temp17],      %["#TEMP4"],    $zero           \n\t"            \
  "slt     %[temp18],      %["#TEMP8"],    $zero           \n\t"            \
  "slt     %[temp19],      %["#TEMP12"],   $zero           \n\t"            \
  "movn    %["#TEMP0"],    $zero,          %[temp16]       \n\t"            \
  "movn    %["#TEMP4"],    $zero,          %[temp17]       \n\t"            \
  "movn    %["#TEMP8"],    $zero,          %[temp18]       \n\t"            \
  "movn    %["#TEMP12"],   $zero,          %[temp19]       \n\t"            \
  "addiu   %[temp20],      $zero,          255             \n\t"            \
  "slt     %[temp16],      %["#TEMP0"],    %[temp20]       \n\t"            \
  "slt     %[temp17],      %["#TEMP4"],    %[temp20]       \n\t"            \
  "slt     %[temp18],      %["#TEMP8"],    %[temp20]       \n\t"            \
  "slt     %[temp19],      %["#TEMP12"],   %[temp20]       \n\t"            \
  "movz    %["#TEMP0"],    %[temp20],      %[temp16]       \n\t"            \
  "movz    %["#TEMP4"],    %[temp20],      %[temp17]       \n\t"            \
  "lw      %[temp16],      8(%[args])                      \n\t"            \
  "movz    %["#TEMP8"],    %[temp20],      %[temp18]       \n\t"            \
  "movz    %["#TEMP12"],   %[temp20],      %[temp19]       \n\t"            \
  "sb      %["#TEMP0"],    "#A"(%[temp16])                 \n\t"            \
  "sb      %["#TEMP4"],    "#B"(%[temp16])                 \n\t"            \
  "sb      %["#TEMP8"],    "#C"(%[temp16])                 \n\t"            \
  "sb      %["#TEMP12"],   "#D"(%[temp16])                 \n\t"
 // Does a single 4x4 inverse transform.
 // Reads the 16 coefficients of 'in' (byte offsets 0..30), runs four vertical
 // passes followed by four horizontal passes, adds the result to the 'ref'
 // pixels and stores the values clamped to [0, 255] into 'dst'.
 // The rows of 'ref' and 'dst' are addressed with the hard-coded byte offsets
 // 0, 16, 32 and 48.
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                                      uint8_t* dst) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
  // The three pointers are handed over through memory: the asm loads them
  // with "lw" from offsets 0 (ref), 4 (in) and 8 (dst) of 'args'.
  const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
  __asm__ volatile(
    // temp20 <- in
    "lw      %[temp20],      4(%[args])                      \n\t"
    VERTICAL_PASS(0, 16,  8, 24, temp4,  temp0,  temp1,  temp2,  temp3)
    VERTICAL_PASS(2, 18, 10, 26, temp8,  temp4,  temp5,  temp6,  temp7)
    VERTICAL_PASS(4, 20, 12, 28, temp12, temp8,  temp9,  temp10, temp11)
    // last pass: temp20 itself serves as the temporary register
    VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
    HORIZONTAL_PASS( 0,  1,  2,  3, temp0, temp4, temp8,  temp12)
    HORIZONTAL_PASS(16, 17, 18, 19, temp1, temp5, temp9,  temp13)
    HORIZONTAL_PASS(32, 33, 34, 35, temp2, temp6, temp10, temp14)
    HORIZONTAL_PASS(48, 49, 50, 51, temp3, temp7, temp11, temp15)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
    : [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
    : "memory", "hi", "lo"
  );
 }
 // Runs ITransformOne() on one block, or on two blocks when 'do_two' is
 // non-zero. The second block starts 4 bytes further in 'ref' and 'dst' and
 // 16 coefficients further in 'in'.
 static void ITransform(const uint8_t* ref, const int16_t* in,
                        uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int k;
  for (k = 0; k < num_blocks; ++k) {
    ITransformOne(ref + 4 * k, in + 16 * k, dst + 4 * k);
  }
 }
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
 // macro for one pass through for loop in QuantizeBlock
 // QUANTDIV macro inlined
 // J - offset in bytes (kZigzag[n] * 2)
 // K - offset in bytes (kZigzag[n] * 4)
 // N - offset in bytes (n * 2)
 // One pass does the following:
 //   sign  = in[j] >> 15 (all ones for a negative coefficient, else zero)
 //   coeff = abs(in[j]) + sharpen[j]
 //   if coeff is not greater than zthresh[j], zero is stored to both in[j]
 //   and out[n]; otherwise
 //     level  = (coeff * iq[j] + bias[j]) >> 17, limited to max_level,
 //              with the sign put back (xor/subu with 'sign')
 //     in[j]  = level * q[j]
 //     out[n] = level
 // The numeric local label "2" is reused by every expansion of the macro.
 #define QUANTIZE_ONE(J, K, N)                                               \
  "lh           %[temp0],       "#J"(%[ppin])                       \n\t"   \
  "lhu          %[temp1],       "#J"(%[ppsharpen])                  \n\t"   \
  "lw           %[temp2],       "#K"(%[ppzthresh])                  \n\t"   \
  "sra          %[sign],        %[temp0],           15              \n\t"   \
  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
  "addu         %[coeff],       %[coeff],           %[temp1]        \n\t"   \
  "slt          %[temp4],       %[temp2],           %[coeff]        \n\t"   \
  "addiu        %[temp5],       $zero,              0               \n\t"   \
  "addiu        %[level],       $zero,              0               \n\t"   \
  "beqz         %[temp4],       2f                                  \n\t"   \
  "lhu          %[temp1],       "#J"(%[ppiq])                       \n\t"   \
  "lw           %[temp2],       "#K"(%[ppbias])                     \n\t"   \
  "lhu          %[temp3],       "#J"(%[ppq])                        \n\t"   \
  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
  "sra          %[level],       %[level],           17              \n\t"   \
  "slt          %[temp4],       %[max_level],       %[level]        \n\t"   \
  "movn         %[level],       %[max_level],       %[temp4]        \n\t"   \
  "xor          %[level],       %[level],           %[sign]         \n\t"   \
  "subu         %[level],       %[level],           %[sign]         \n\t"   \
  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
 "2:                                                                 \n\t"   \
  "sh           %[temp5],       "#J"(%[ppin])                       \n\t"   \
  "sh           %[level],       "#N"(%[pout])                       \n\t"
 // Quantizes the 16 coefficients of 'in' with the tables of 'mtx' (sharpen_,
 // zthresh_, q_, iq_ and bias_).
 // On return, out[n] holds the quantized level of the n-th coefficient in the
 // order given by the QUANTIZE_ONE offsets, and every in[] entry has been
 // overwritten with its level multiplied by the matching q_ entry (zero when
 // the level is zero).
 // Returns 1 if at least one value in out[] is non-zero, 0 otherwise.
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int sign, coeff, level, i;
  int max_level = MAX_LEVEL;
  int16_t* ppin             = &in[0];
  int16_t* pout             = &out[0];
  const uint16_t* ppsharpen = &mtx->sharpen_[0];
  const uint32_t* ppzthresh = &mtx->zthresh_[0];
  const uint16_t* ppq       = &mtx->q_[0];
  const uint16_t* ppiq      = &mtx->iq_[0];
  const uint32_t* ppbias    = &mtx->bias_[0];
  __asm__ volatile(
    // arguments: byte offset into the 16-bit tables, byte offset into the
    // 32-bit tables, byte offset into out[]
    QUANTIZE_ONE( 0,  0,  0)
    QUANTIZE_ONE( 2,  4,  2)
    QUANTIZE_ONE( 8, 16,  4)
    QUANTIZE_ONE(16, 32,  6)
    QUANTIZE_ONE(10, 20,  8)
    QUANTIZE_ONE( 4,  8, 10)
    QUANTIZE_ONE( 6, 12, 12)
    QUANTIZE_ONE(12, 24, 14)
    QUANTIZE_ONE(18, 36, 16)
    QUANTIZE_ONE(24, 48, 18)
    QUANTIZE_ONE(26, 52, 20)
    QUANTIZE_ONE(20, 40, 22)
    QUANTIZE_ONE(14, 28, 24)
    QUANTIZE_ONE(22, 44, 26)
    QUANTIZE_ONE(28, 56, 28)
    QUANTIZE_ONE(30, 60, 30)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [sign]"=&r"(sign), [coeff]"=&r"(coeff),
      [level]"=&r"(level)
    : [pout]"r"(pout), [ppin]"r"(ppin),
      [ppiq]"r"(ppiq), [max_level]"r"(max_level),
      [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
      [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
    : "memory", "hi", "lo"
  );
  // The non-zero check is done here rather than inside the macro; out[] is
  // scanned from its last entry down so the loop can stop at the first
  // non-zero level it meets.
  for (i = 15; i >= 0; i--) {
    if (out[i]) return 1;
  }
  return 0;
 }
 #undef QUANTIZE_ONE
 // macro for one horizontal pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
 // A..D - offsets in bytes to load from a and b buffers
 // E..H - offsets in bytes to store first results to tmp buffer
 // E1..H1 - offsets in bytes to store second results to tmp buffer
 // Four bytes are loaded from 'a' (temp0..temp3) and four from 'b'
 // (temp4..temp7). Each group goes through two add/subtract stages; the four
 // values derived from 'a' are stored as 32-bit words at E..H and the four
 // values derived from 'b' at E1..H1. temp8 is a scratch register.
 #define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1)   \
  "lbu    %[temp0],  "#A"(%[a])              \n\t"                \
  "lbu    %[temp1],  "#B"(%[a])              \n\t"                \
  "lbu    %[temp2],  "#C"(%[a])              \n\t"                \
  "lbu    %[temp3],  "#D"(%[a])              \n\t"                \
  "lbu    %[temp4],  "#A"(%[b])              \n\t"                \
  "lbu    %[temp5],  "#B"(%[b])              \n\t"                \
  "lbu    %[temp6],  "#C"(%[b])              \n\t"                \
  "lbu    %[temp7],  "#D"(%[b])              \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
  "addu   %[temp2],  %[temp1],    %[temp3]   \n\t"                \
  "subu   %[temp1],  %[temp1],    %[temp3]   \n\t"                \
  "addu   %[temp3],  %[temp4],    %[temp6]   \n\t"                \
  "subu   %[temp4],  %[temp4],    %[temp6]   \n\t"                \
  "addu   %[temp6],  %[temp5],    %[temp7]   \n\t"                \
  "subu   %[temp5],  %[temp5],    %[temp7]   \n\t"                \
  "addu   %[temp7],  %[temp8],    %[temp2]   \n\t"                \
  "subu   %[temp2],  %[temp8],    %[temp2]   \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp3],    %[temp6]   \n\t"                \
  "subu   %[temp3],  %[temp3],    %[temp6]   \n\t"                \
  "addu   %[temp6],  %[temp4],    %[temp5]   \n\t"                \
  "subu   %[temp4],  %[temp4],    %[temp5]   \n\t"                \
  "sw     %[temp7],  "#E"(%[tmp])            \n\t"                \
  "sw     %[temp2],  "#H"(%[tmp])            \n\t"                \
  "sw     %[temp8],  "#F"(%[tmp])            \n\t"                \
  "sw     %[temp0],  "#G"(%[tmp])            \n\t"                \
  "sw     %[temp1],  "#E1"(%[tmp])           \n\t"                \
  "sw     %[temp3],  "#H1"(%[tmp])           \n\t"                \
  "sw     %[temp6],  "#F1"(%[tmp])           \n\t"                \
  "sw     %[temp4],  "#G1"(%[tmp])           \n\t"
 // macro for one vertical pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
 // since only one accumulator is available in the mips32r1 instruction set,
 //   the second call of TTransform is processed first and the first call
 //   after it.
 //   const int sum1 = TTransform(a, w);
 //   const int sum2 = TTransform(b, w);
 //   return abs(sum2 - sum1) >> 5;
 //   (sum2 - sum1) is calculated with madds (sum2) and msubs (sum1)
 // A..D - offsets in bytes to load first results from tmp buffer
 // A1..D1 - offsets in bytes to load second results from tmp buffer
 // E..H - offsets in bytes to load from w buffer
 // Absolute values are computed with the sra 31 / xor / subu sequence. The
 // four weights loaded from w stay in temp4..temp7 between the madd and the
 // msub half; temp4 is only reused as scratch after its msub has been issued.
 #define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
  "lw     %[temp0],  "#A1"(%[tmp])           \n\t"                \
  "lw     %[temp1],  "#C1"(%[tmp])           \n\t"                \
  "lw     %[temp2],  "#B1"(%[tmp])           \n\t"                \
  "lw     %[temp3],  "#D1"(%[tmp])           \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp0],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
  "sra    %[temp4],  %[temp3],    31         \n\t"                \
  "sra    %[temp5],  %[temp1],    31         \n\t"                \
  "sra    %[temp6],  %[temp0],    31         \n\t"                \
  "sra    %[temp7],  %[temp8],    31         \n\t"                \
  "xor    %[temp3],  %[temp3],    %[temp4]   \n\t"                \
  "xor    %[temp1],  %[temp1],    %[temp5]   \n\t"                \
  "xor    %[temp0],  %[temp0],    %[temp6]   \n\t"                \
  "xor    %[temp8],  %[temp8],    %[temp7]   \n\t"                \
  "subu   %[temp3],  %[temp3],    %[temp4]   \n\t"                \
  "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
  "lhu    %[temp4],  "#E"(%[w])              \n\t"                \
  "lhu    %[temp5],  "#F"(%[w])              \n\t"                \
  "lhu    %[temp6],  "#G"(%[w])              \n\t"                \
  "lhu    %[temp7],  "#H"(%[w])              \n\t"                \
  "madd   %[temp4],  %[temp3]                \n\t"                \
  "madd   %[temp5],  %[temp1]                \n\t"                \
  "madd   %[temp6],  %[temp0]                \n\t"                \
  "madd   %[temp7],  %[temp8]                \n\t"                \
  "lw     %[temp0],  "#A"(%[tmp])            \n\t"                \
  "lw     %[temp1],  "#C"(%[tmp])            \n\t"                \
  "lw     %[temp2],  "#B"(%[tmp])            \n\t"                \
  "lw     %[temp3],  "#D"(%[tmp])            \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
  "subu   %[temp1],  %[temp8],    %[temp1]   \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
  "sra    %[temp2],  %[temp3],    31         \n\t"                \
  "xor    %[temp3],  %[temp3],    %[temp2]   \n\t"                \
  "subu   %[temp3],  %[temp3],    %[temp2]   \n\t"                \
  "msub   %[temp4],  %[temp3]                \n\t"                \
  "sra    %[temp2],  %[temp8],    31         \n\t"                \
  "sra    %[temp3],  %[temp0],    31         \n\t"                \
  "sra    %[temp4],  %[temp1],    31         \n\t"                \
  "xor    %[temp8],  %[temp8],    %[temp2]   \n\t"                \
  "xor    %[temp0],  %[temp0],    %[temp3]   \n\t"                \
  "xor    %[temp1],  %[temp1],    %[temp4]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp3]   \n\t"                \
  "subu   %[temp1],  %[temp1],    %[temp4]   \n\t"                \
  "msub   %[temp5],  %[temp8]                \n\t"                \
  "msub   %[temp6],  %[temp0]                \n\t"                \
  "msub   %[temp7],  %[temp1]                \n\t"
 // Returns abs(sum2 - sum1) >> 5, where sum1 and sum2 are the weighted sums
 // of absolute transformed values computed from 'a' and from 'b' with the
 // weights 'w'.
 // Four bytes are read from each of four rows of 'a' and 'b'; the rows are
 // addressed with the hard-coded byte offsets 0, 16, 32 and 48.
 // NOTE(review): presumably this row stride of 16 is meant to equal BPS, which
 // the Disto16x16() caller uses -- confirm.
 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                     const uint16_t* const w) {
  // Intermediate results of the horizontal passes: bytes 0..63 hold the
  // values derived from 'a', bytes 64..127 those derived from 'b'.
  int tmp[32];
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  __asm__ volatile(
    HORIZONTAL_PASS( 0,  1,  2,  3,    0,  4,  8, 12,    64,  68,  72,  76)
    HORIZONTAL_PASS(16, 17, 18, 19,   16, 20, 24, 28,    80,  84,  88,  92)
    HORIZONTAL_PASS(32, 33, 34, 35,   32, 36, 40, 44,    96, 100, 104, 108)
    HORIZONTAL_PASS(48, 49, 50, 51,   48, 52, 56, 60,   112, 116, 120, 124)
    // clear the hi/lo accumulator before the madd/msub sequences
    "mthi   $zero                             \n\t"
    "mtlo   $zero                             \n\t"
    VERTICAL_PASS( 0, 16, 32, 48,     64, 80,  96, 112,   0,  8, 16, 24)
    VERTICAL_PASS( 4, 20, 36, 52,     68, 84, 100, 116,   2, 10, 18, 26)
    VERTICAL_PASS( 8, 24, 40, 56,     72, 88, 104, 120,   4, 12, 20, 28)
    VERTICAL_PASS(12, 28, 44, 60,     76, 92, 108, 124,   6, 14, 22, 30)
    // temp0 = abs(lo) >> 5
    "mflo   %[temp0]                          \n\t"
    "sra    %[temp1],  %[temp0],  31          \n\t"
    "xor    %[temp0],  %[temp0],  %[temp1]    \n\t"
    "subu   %[temp0],  %[temp0],  %[temp1]    \n\t"
    "sra    %[temp0],  %[temp0],  5           \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
    : [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
    : "memory", "hi", "lo"
  );
  return temp0;
 }
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
 // Sums Disto4x4() over the sixteen 4x4 sub-blocks of a 16x16 area, visited
 // row of blocks by row of blocks. Successive rows of blocks are 4 * BPS
 // bytes apart and successive blocks within a row are 4 bytes apart.
 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                       const uint16_t* const w) {
  int total = 0;
  int row, col;
  for (row = 0; row < 4; ++row) {
    const int row_offset = row * 4 * BPS;
    for (col = 0; col < 4; ++col) {
      const int offset = 4 * col + row_offset;
      total += Disto4x4(a + offset, b + offset, w);
    }
  }
  return total;
 }
 // macro for one horizontal pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to load from src and ref buffers
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 // TEMP1 and TEMP2 first receive the two pointers stored at 0(%[args]) and
 // 4(%[args]); they are overwritten once the last bytes have been loaded.
 // With d0..d3 the four differences (byte read through the first pointer
 // minus byte read through the second one), a0 = d0 + d3, a1 = d1 + d2,
 // a2 = d1 - d2 and a3 = d0 - d3, the pass leaves:
 //   TEMP0 = (a0 + a1) << 3
 //   TEMP1 = (a2 * 2217 + a3 * 5352 + 1812) >> 9
 //   TEMP2 = (a0 - a1) << 3
 //   TEMP3 = (a3 * 2217 - a2 * 5352 + 937) >> 9
 // temp16..temp20 are used as scratch registers.
 #define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
  "lw     %["#TEMP1"],  0(%[args])                     \n\t"    \
  "lw     %["#TEMP2"],  4(%[args])                     \n\t"    \
  "lbu    %[temp16],    "#A"(%["#TEMP1"])              \n\t"    \
  "lbu    %[temp17],    "#A"(%["#TEMP2"])              \n\t"    \
  "lbu    %[temp18],    "#B"(%["#TEMP1"])              \n\t"    \
  "lbu    %[temp19],    "#B"(%["#TEMP2"])              \n\t"    \
  "subu   %[temp20],    %[temp16],    %[temp17]        \n\t"    \
  "lbu    %[temp16],    "#C"(%["#TEMP1"])              \n\t"    \
  "lbu    %[temp17],    "#C"(%["#TEMP2"])              \n\t"    \
  "subu   %["#TEMP0"],  %[temp18],    %[temp19]        \n\t"    \
  "lbu    %[temp18],    "#D"(%["#TEMP1"])              \n\t"    \
  "lbu    %[temp19],    "#D"(%["#TEMP2"])              \n\t"    \
  "subu   %["#TEMP1"],  %[temp16],    %[temp17]        \n\t"    \
  "subu   %["#TEMP2"],  %[temp18],    %[temp19]        \n\t"    \
  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]      \n\t"    \
  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]      \n\t"    \
  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
  "mul    %[temp16],    %["#TEMP2"],  %[c5352]         \n\t"    \
  "mul    %[temp17],    %["#TEMP2"],  %[c2217]         \n\t"    \
  "mul    %[temp18],    %["#TEMP0"],  %[c5352]         \n\t"    \
  "mul    %[temp19],    %["#TEMP0"],  %[c2217]         \n\t"    \
  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]        \n\t"    \
  "subu   %[temp20],    %["#TEMP3"],  %[temp20]        \n\t"    \
  "sll    %["#TEMP0"],  %["#TEMP1"],  3                \n\t"    \
  "sll    %["#TEMP2"],  %[temp20],    3                \n\t"    \
  "addiu  %[temp16],    %[temp16],    1812             \n\t"    \
  "addiu  %[temp17],    %[temp17],    937              \n\t"    \
  "addu   %[temp16],    %[temp16],    %[temp19]        \n\t"    \
  "subu   %[temp17],    %[temp17],    %[temp18]        \n\t"    \
  "sra    %["#TEMP1"],  %[temp16],    9                \n\t"    \
  "sra    %["#TEMP3"],  %[temp17],    9                \n\t"
 // macro for one vertical pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to store to out buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
 // With a0 = TEMP0 + TEMP12, a1 = TEMP4 + TEMP8, a2 = TEMP4 - TEMP8 and
 // a3 = TEMP0 - TEMP12, the pass stores through temp20 (16-bit stores):
 //   at A: (a0 + a1 + 7) >> 4
 //   at B: ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0)
 //   at C: (a0 - a1 + 7) >> 4
 //   at D: (a3 * 2217 - a2 * 5352 + 51000) >> 16
 // The constant 51000 is added in two steps (30000 + 21000), since it does
 // not fit the 16-bit signed immediate of addiu.
 // temp16..temp19 are used as scratch registers.
 #define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)  \
  "addu   %[temp16],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
  "subu   %[temp19],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
  "addu   %[temp17],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
  "subu   %[temp18],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
  "mul    %["#TEMP8"],  %[temp19],    %[c2217]         \n\t"    \
  "mul    %["#TEMP12"], %[temp18],    %[c2217]         \n\t"    \
  "mul    %["#TEMP4"],  %[temp19],    %[c5352]         \n\t"    \
  "mul    %[temp18],    %[temp18],    %[c5352]         \n\t"    \
  "addiu  %[temp16],    %[temp16],    7                \n\t"    \
  "addu   %["#TEMP0"],  %[temp16],    %[temp17]        \n\t"    \
  "sra    %["#TEMP0"],  %["#TEMP0"],  4                \n\t"    \
  "addu   %["#TEMP12"], %["#TEMP12"], %["#TEMP4"]      \n\t"    \
  "subu   %["#TEMP4"],  %[temp16],    %[temp17]        \n\t"    \
  "sra    %["#TEMP4"],  %["#TEMP4"],  4                \n\t"    \
  "addiu  %["#TEMP8"],  %["#TEMP8"],  30000            \n\t"    \
  "addiu  %["#TEMP12"], %["#TEMP12"], 12000            \n\t"    \
  "addiu  %["#TEMP8"],  %["#TEMP8"],  21000            \n\t"    \
  "subu   %["#TEMP8"],  %["#TEMP8"],  %[temp18]        \n\t"    \
  "sra    %["#TEMP12"], %["#TEMP12"], 16               \n\t"    \
  "sra    %["#TEMP8"],  %["#TEMP8"],  16               \n\t"    \
  "addiu  %[temp16],    %["#TEMP12"], 1                \n\t"    \
  "movn   %["#TEMP12"], %[temp16],    %[temp19]        \n\t"    \
  "sh     %["#TEMP0"],  "#A"(%[temp20])                \n\t"    \
  "sh     %["#TEMP4"],  "#C"(%[temp20])                \n\t"    \
  "sh     %["#TEMP8"],  "#D"(%[temp20])                \n\t"    \
  "sh     %["#TEMP12"], "#B"(%[temp20])                \n\t"
 // Forward transform of a 4x4 block of differences (src - ref).
 // Four bytes are read from each of four rows of 'src' and 'ref' (rows at the
 // hard-coded byte offsets 0, 16, 32 and 48) and the 16 resulting
 // coefficients are written to 'out' as 16-bit values (byte offsets 0..30).
 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
  int temp17, temp18, temp19, temp20;
  const int c2217 = 2217;
  const int c5352 = 5352;
  // The three pointers are handed over through memory: the asm loads them
  // with "lw" from offsets 0 (src), 4 (ref) and 8 (out) of 'args'.
  const int* const args[3] =
      { (const int*)src, (const int*)ref, (const int*)out };
  __asm__ volatile(
    HORIZONTAL_PASS( 0,  1,  2,  3, temp0,  temp1,  temp2,  temp3)
    HORIZONTAL_PASS(16, 17, 18, 19, temp4,  temp5,  temp6,  temp7)
    HORIZONTAL_PASS(32, 33, 34, 35, temp8,  temp9,  temp10, temp11)
    HORIZONTAL_PASS(48, 49, 50, 51, temp12, temp13, temp14, temp15)
    // temp20 <- out, used as the store base by the vertical passes
    "lw   %[temp20],    8(%[args])                     \n\t"
    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
    VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
    VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
    : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
    : "memory", "hi", "lo"
  );
 }
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
 // Forward declaration.
 extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
 // Returns the cost of coding the coefficients described by 'res', starting
 // with context 'ctx0'.
 // The assembly loop is the MIPS32 equivalent of:
 //   for (; n < res->last; ++n) {
 //     const int v = abs(res->coeffs[n]);
 //     const int b = VP8EncBands[n + 1];
 //     const int ctx = (v >= 2) ? 2 : v;
 //     cost += VP8LevelFixedCosts[v] + t[min(v, MAX_VARIABLE_LEVEL)];
 //     t = res->cost[b][ctx];
 //   }
 // The address of res->cost[b][ctx] is computed with hard-coded strides of
 // 136 bytes per context and 408 bytes per band.
 // NOTE(review): these strides presumably match the declared layout of
 // res->cost, and pointers are assumed to fit in an int -- confirm whenever
 // the cost tables of VP8Residual change.
 int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  int p0 = res->prob[n][ctx0][0];
  const uint16_t* t = res->cost[n][ctx0];
  int cost;
  const int const_2 = 2;
  const int const_max_level = MAX_VARIABLE_LEVEL;
  int res_cost;
  int res_coeffs;
  int res_last;
  int v_reg;
  int b_reg;
  int ctx_reg;
  int cost_add, temp_1, temp_2, temp_3;
  if (res->last < 0) {
    return VP8BitCost(0, p0);
  }
  cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
  res_cost = (int)res->cost;
  res_coeffs = (int)res->coeffs;
  res_last = (int)res->last;
  __asm__ volatile(
    ".set   push                                                           \n\t"
    ".set   noreorder                                                      \n\t"
    "sll    %[temp_1],     %[n],              1                            \n\t"
    "addu   %[res_coeffs], %[res_coeffs],     %[temp_1]                    \n\t"
    "slt    %[temp_2],     %[n],              %[res_last]                  \n\t"
    "bnez   %[temp_2],     1f                                              \n\t"
    " li    %[cost_add],   0                                               \n\t"
    "b      2f                                                             \n\t"
    " nop                                                                  \n\t"
  "1:                                                                      \n\t"
    "lh     %[v_reg],      0(%[res_coeffs])                                \n\t"
    "addu   %[b_reg],      %[n],              %[VP8EncBands]               \n\t"
    "move   %[temp_1],     %[const_max_level]                              \n\t"
    "addu   %[cost],       %[cost],           %[cost_add]                  \n\t"
    "negu   %[temp_2],     %[v_reg]                                        \n\t"
    "slti   %[temp_3],     %[v_reg],          0                            \n\t"
    "movn   %[v_reg],      %[temp_2],         %[temp_3]                    \n\t"
    "lbu    %[b_reg],      1(%[b_reg])                                     \n\t"
    "li     %[cost_add],   0                                               \n\t"
    "sltiu  %[temp_3],     %[v_reg],          2                            \n\t"
    "move   %[ctx_reg],    %[v_reg]                                        \n\t"
    "movz   %[ctx_reg],    %[const_2],        %[temp_3]                    \n\t"
    //  cost += VP8LevelCost(t, v);
    "slt    %[temp_3],     %[v_reg],          %[const_max_level]           \n\t"
    "movn   %[temp_1],     %[v_reg],          %[temp_3]                    \n\t"
    "sll    %[temp_2],     %[v_reg],          1                            \n\t"
    "addu   %[temp_2],     %[temp_2],         %[VP8LevelFixedCosts]        \n\t"
    "lhu    %[temp_2],     0(%[temp_2])                                    \n\t"
    "sll    %[temp_1],     %[temp_1],         1                            \n\t"
    "addu   %[temp_1],     %[temp_1],         %[t]                         \n\t"
    "lhu    %[temp_3],     0(%[temp_1])                                    \n\t"
    "addu   %[cost],       %[cost],           %[temp_2]                    \n\t"
    //  t = res->cost[b][ctx];
    "sll    %[temp_1],     %[ctx_reg],        7                            \n\t"
    "sll    %[temp_2],     %[ctx_reg],        3                            \n\t"
    "addu   %[cost],       %[cost],           %[temp_3]                    \n\t"
    "addu   %[temp_1],     %[temp_1],         %[temp_2]                    \n\t"
    "sll    %[temp_2],     %[b_reg],          3                            \n\t"
    "sll    %[temp_3],     %[b_reg],          5                            \n\t"
    "sub    %[temp_2],     %[temp_3],         %[temp_2]                    \n\t"
    "sll    %[temp_3],     %[temp_2],         4                            \n\t"
    "addu   %[temp_1],     %[temp_1],         %[temp_3]                    \n\t"
    "addu   %[temp_2],     %[temp_2],         %[res_cost]                  \n\t"
    "addiu  %[n],          %[n],              1                            \n\t"
    "addu   %[t],          %[temp_1],         %[temp_2]                    \n\t"
    "slt    %[temp_1],     %[n],              %[res_last]                  \n\t"
    "bnez   %[temp_1],     1b                                              \n\t"
    " addiu %[res_coeffs], %[res_coeffs],     2                            \n\t"
   "2:                                                                     \n\t"
    ".set   pop                                                            \n\t"
    // res_coeffs is advanced inside the loop, so it has to be a read-write
    // ("+r") operand: the compiler must not assume that its register still
    // holds the original value after the asm statement.
    : [cost]"+r"(cost), [t]"+r"(t), [n]"+r"(n), [v_reg]"=&r"(v_reg),
      [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [cost_add]"=&r"(cost_add),
      [temp_1]"=&r"(temp_1), [temp_2]"=&r"(temp_2), [temp_3]"=&r"(temp_3),
      [res_coeffs]"+r"(res_coeffs)
    : [const_2]"r"(const_2), [res_last]"r"(res_last),
      [VP8EncBands]"r"(VP8EncBands), [const_max_level]"r"(const_max_level),
      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_cost]"r"(res_cost)
    : "memory"
  );
  // Last coefficient is always non-zero
  {
    const int v = abs(res->coeffs[n]);
    assert(v != 0);
    cost += VP8LevelCost(t, v);
    if (n < 15) {
      const int b = VP8EncBands[n + 1];
      const int ctx = (v == 1) ? 1 : 2;
      const int last_p0 = res->prob[b][ctx][0];
      cost += VP8BitCost(0, last_p0);
    }
  }
  return cost;
 }
 // Emits the assembly that accumulates four squared byte differences.
 // For each of the byte offsets A, B, C and D it loads one unsigned byte from
 // %[a] and one from %[b], subtracts them, and feeds the difference to 'madd',
 // which adds the product difference * difference to the HI/LO accumulator.
 // temp0..temp7 are used as scratch registers. The offsets are stringified
 // (#A ...), so they may be constant expressions that the assembler evaluates.
 // The caller is responsible for initializing HI/LO beforehand.
 #define GET_SSE_INNER(A, B, C, D)                               \
  "lbu     %[temp0],    "#A"(%[a])                   \n\t"      \
  "lbu     %[temp1],    "#A"(%[b])                   \n\t"      \
  "lbu     %[temp2],    "#B"(%[a])                   \n\t"      \
  "lbu     %[temp3],    "#B"(%[b])                   \n\t"      \
  "lbu     %[temp4],    "#C"(%[a])                   \n\t"      \
  "lbu     %[temp5],    "#C"(%[b])                   \n\t"      \
  "lbu     %[temp6],    "#D"(%[a])                   \n\t"      \
  "lbu     %[temp7],    "#D"(%[b])                   \n\t"      \
  "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
  "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
  "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
  "subu    %[temp6],    %[temp6],     %[temp7]       \n\t"      \
  "madd    %[temp0],    %[temp0]                     \n\t"      \
  "madd    %[temp2],    %[temp2]                     \n\t"      \
  "madd    %[temp4],    %[temp4]                     \n\t"      \
  "madd    %[temp6],    %[temp6]                     \n\t"
 // Expands GET_SSE_INNER four times, once per start offset A, B, C and D.
 // Each expansion covers the 4 consecutive bytes beginning at that offset
 // (X, X + 1, X + 2, X + 3), so one GET_SSE accumulates 16 squared differences.
 #define GET_SSE(A, B, C, D)               \
  GET_SSE_INNER(A, A + 1, A + 2, A + 3)   \
  GET_SSE_INNER(B, B + 1, B + 2, B + 3)   \
  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)
 #if !defined(WORK_AROUND_GCC)
 // Returns the sum of squared differences between the bytes of 'a' and 'b'
 // over byte offsets 0..255, i.e. 256 contiguous bytes of each buffer.
 // NOTE(review): this matches a 16x16 block only if rows are 16 bytes apart
 // (presumably the project's BPS stride) -- confirm against the callers.
 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  __asm__ volatile(
     // 0 * 0: clears the HI/LO accumulator before the madd sequence.
     "mult   $zero,    $zero                            \n\t"
     GET_SSE(  0,   4,   8,  12)
     GET_SSE( 16,  20,  24,  28)
     GET_SSE( 32,  36,  40,  44)
     GET_SSE( 48,  52,  56,  60)
     GET_SSE( 64,  68,  72,  76)
     GET_SSE( 80,  84,  88,  92)
     GET_SSE( 96, 100, 104, 108)
     GET_SSE(112, 116, 120, 124)
     GET_SSE(128, 132, 136, 140)
     GET_SSE(144, 148, 152, 156)
     GET_SSE(160, 164, 168, 172)
     GET_SSE(176, 180, 184, 188)
     GET_SSE(192, 196, 200, 204)
     GET_SSE(208, 212, 216, 220)
     GET_SSE(224, 228, 232, 236)
     GET_SSE(240, 244, 248, 252)
    // Only the low word is read back: 256 * 255^2 fits in 32 bits.
    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi" , "lo"
  );
  return count;
 }
 // Returns the sum of squared differences between the bytes of 'a' and 'b'
 // over byte offsets 0..127, i.e. 128 contiguous bytes of each buffer.
 // NOTE(review): this matches 8 rows of 16 pixels only if rows are 16 bytes
 // apart (presumably the project's BPS stride) -- confirm against the callers.
 static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  __asm__ volatile(
     // 0 * 0: clears the HI/LO accumulator before the madd sequence.
     "mult   $zero,    $zero                            \n\t"
     GET_SSE(  0,   4,   8,  12)
     GET_SSE( 16,  20,  24,  28)
     GET_SSE( 32,  36,  40,  44)
     GET_SSE( 48,  52,  56,  60)
     GET_SSE( 64,  68,  72,  76)
     GET_SSE( 80,  84,  88,  92)
     GET_SSE( 96, 100, 104, 108)
     GET_SSE(112, 116, 120, 124)
    // The accumulated sum is read from the low word of the accumulator.
    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi" , "lo"
  );
  return count;
 }
 // Returns the sum of squared differences between 'a' and 'b' over eight runs
 // of 8 bytes, starting at byte offsets 0, 16, 32, ..., 112 (each GET_SSE line
 // handles two such runs: offsets X..X+7 and X+16..X+23).
 // NOTE(review): the 16-byte spacing between runs is presumably the project's
 // BPS row stride -- confirm against the callers.
 static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  __asm__ volatile(
     // 0 * 0: clears the HI/LO accumulator before the madd sequence.
     "mult   $zero,    $zero                            \n\t"
     GET_SSE( 0,   4,  16,  20)
     GET_SSE(32,  36,  48,  52)
     GET_SSE(64,  68,  80,  84)
     GET_SSE(96, 100, 112, 116)
    // The accumulated sum is read from the low word of the accumulator.
    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi" , "lo"
  );
  return count;
 }
 // Returns the sum of squared differences between 'a' and 'b' over four runs
 // of 4 bytes, starting at byte offsets 0, 16, 32 and 48.
 // NOTE(review): the 16-byte spacing between runs is presumably the project's
 // BPS row stride -- confirm against the callers.
 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  __asm__ volatile(
     // 0 * 0: clears the HI/LO accumulator before the madd sequence.
     "mult   $zero,    $zero                            \n\t"
     GET_SSE(0, 16, 32, 48)
    // The accumulated sum is read from the low word of the accumulator.
    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi" , "lo"
  );
  return count;
 }
 #endif  // WORK_AROUND_GCC
 // Drop the helper macros defined above for the SSE routines so they do not
 // leak past this section. The names must match the #define's (GET_SSE and
 // GET_SSE_INNER); undefining a name that was never defined is a silent no-op.
 #undef GET_SSE
 #undef GET_SSE_INNER
 #endif  // WEBP_USE_MIPS32
 //------------------------------------------------------------------------------
 // Entry point
 // Points the encoder's DSP hooks at the MIPS32 routines of this file.
 // The function body is empty when WEBP_USE_MIPS32 is not defined.
 extern void VP8EncDspInitMIPS32(void);
 void VP8EncDspInitMIPS32(void) {
 #ifdef WEBP_USE_MIPS32
  // Transforms and quantization.
  VP8FTransform = FTransform;
  VP8ITransform = ITransform;
  VP8EncQuantizeBlock = QuantizeBlock;
  // Texture distortion.
  VP8TDisto16x16 = Disto16x16;
  VP8TDisto4x4 = Disto4x4;
 #ifndef WORK_AROUND_GCC
  // Sum-of-squared-error routines; compiled out with WORK_AROUND_GCC.
  VP8SSE4x4 = SSE4x4;
  VP8SSE8x8 = SSE8x8;
  VP8SSE16x8 = SSE16x8;
  VP8SSE16x16 = SSE16x16;
 #endif  // !WORK_AROUND_GCC
 #endif  // WEBP_USE_MIPS32
 }
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // ARM NEON version of speed-critical encoding functions.
@ -11,24 +13,124 @@
 #include "./dsp.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #if defined(WEBP_USE_NEON)
 #include <assert.h>
 #include "./neon.h"
 #include "../enc/vp8enci.h"
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 // Inverse transform.
-// This code is pretty much the same as TransformOneNEON in the decoder, except
+// This code is pretty much the same as TransformOne in the dec_neon.c, except
 // for subtraction to *ref. See the comments there for algorithmic explanations.
 static const int16_t kC1 = 20091;
 static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
 // This code works but is *slower* than the inlined-asm version below
 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
 // USE_INTRINSICS define.
 // With gcc-4.8, it is slightly faster than the inlined assembly.
 #if defined(USE_INTRINSICS)
 // Views the 64-bit vector 'v' as eight unsigned bytes and widens each byte
 // (zero extension) into a signed 16-bit lane.
 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
  const uint8x8_t bytes = vreinterpret_u8_u32(v);
  const uint16x8_t widened = vmovl_u8(bytes);
  return vreinterpretq_s16_u16(widened);
 }
 // Narrows 'dst01' (rows 0 and 1) and 'dst23' (rows 2 and 3) to bytes with
 // unsigned saturation, then writes each 4-byte row to 'dst', one row every
 // BPS bytes.
 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
                                             const int16x8_t dst01,
                                             const int16x8_t dst23) {
  // Each 32-bit lane holds the four saturated pixels of one row.
  const uint32x2_t rows01 = vreinterpret_u32_u8(vqmovun_s16(dst01));
  const uint32x2_t rows23 = vreinterpret_u32_u8(vqmovun_s16(dst23));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), rows01, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), rows01, 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), rows23, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), rows23, 1);
 }
 // Adds the descaled values in 'row01' (rows 0 and 1) and 'row23' (rows 2 and
 // 3) to the 4x4 pixels read from 'ref', and stores the result, saturated to
 // 8 bits, to 'dst'. Consecutive rows of 'ref' and 'dst' are BPS bytes apart.
 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
                                const uint8_t* const ref, uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);
  // Load the reference pixels, one 4-byte row per 32-bit lane. 'ref' is only
  // read, so the casts keep its const qualifier.
  dst01 = vld1_lane_u32((const uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((const uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((const uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((const uint32_t*)(ref + 3 * BPS), dst23, 1);
  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
    // Descale with rounding and add to the reference pixels in one step:
    // out = ref + ((row + 4) >> 3).
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Saturate to 8 bits and write the four rows out.
    SaturateAndStore4x4(dst, out01, out23);
  }
 }
 // Transposes the 4x4 int16 matrix held as two 8-lane vectors:
 //   in0 = a0 a1 a2 a3 | b0 b1 b2 b3      out[0] = a0 b0 c0 d0 | a1 b1 c1 d1
 //   in1 = c0 c1 c2 c3 | d0 d1 d2 d3  =>  out[1] = a2 b2 c2 d2 | a3 b3 c3 d3
 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
                                      int16x8x2_t* const out) {
  // First interleave: a0 c0 a1 c1 a2 c2 a3 c3 / b0 d0 b1 d1 b2 d2 b3 d3.
  const int16x8x2_t zipped = vzipq_s16(in0, in1);
  const int16x8_t ac = zipped.val[0];
  const int16x8_t bd = zipped.val[1];
  // Second interleave produces the transposed rows.
  *out = vzipq_s16(ac, bd);
 }
 // Runs one pass of the inverse transform over the four 4-lane int16 vectors
 // packed in 'rows' and writes the transposed result back into 'rows', so that
 // calling it a second time processes the other direction.
 // All additions and subtractions use saturating intrinsics (vqadd / vqsub).
 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  // vqdmulh returns the high half of the doubled product, (2 * x * k) >> 16.
  // For C0 the extra '>> 1' undoes the doubling and B1 itself is added in.
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  // Swap the two halves: E1 = b-c | a-d.
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2(E0, E1, rows);
 }
 // Inverse-transforms the 16 coefficients at 'in' and adds the result to the
 // 4x4 block at 'ref', writing the reconstructed pixels to 'dst'.
 static void ITransformOne(const uint8_t* ref,
                           const int16_t* in, uint8_t* dst) {
  const int16x8_t coeffs_lo = vld1q_s16(in + 0);
  const int16x8_t coeffs_hi = vld1q_s16(in + 8);
  int16x8x2_t rows;
  int pass;
  INIT_VECTOR2(rows, coeffs_lo, coeffs_hi);
  // Two identical passes; each one ends with a transpose.
  for (pass = 0; pass < 2; ++pass) {
    TransformPass(&rows);
  }
  Add4x4(rows.val[0], rows.val[1], ref, dst);
 }
 #else
 static void ITransformOne(const uint8_t* ref,
                          const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
-  const int16_t kC1C2[] = { 20091, 17734, 0, 0 };  // kC1 / (kC2 >> 1) / 0 / 0
+  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
@ -139,6 +241,8 @@ static void ITransformOne(const uint8_t* ref,
  );
 }
 #endif    // USE_INTRINSICS
 static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
@ -147,76 +251,102 @@ static void ITransform(const uint8_t* ref,
  }
 }
-// Same code as dec_neon.c
+// Load all 4x4 pixels into a single uint8x16_t variable.
-static void ITransformWHT(const int16_t* in, int16_t* out) {
+static uint8x16_t Load4x4(const uint8_t* src) {
-  const int kStep = 32;  // The store is only incrementing the pointer as if we
+  uint32x4_t out = vdupq_n_u32(0);
-                         // had stored a single byte.
+  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
-  __asm__ volatile (
+  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
-    // part 1
+  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
-    // load data into q0, q1
+  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
-    "vld1.16         {q0, q1}, [%[in]]           \n"
+  return vreinterpretq_u8_u32(out);
    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]
    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2
    // Transpose
    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
    "vtrn.32         q0, q1                      \n"
    "vtrn.32         q2, q3                      \n"
    "vmov.s32        q4, #3                      \n" // dc = 3
    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]
    "vadd.s32        q0, q6, q7                  \n"
    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
    "vadd.s32        q1, q9, q8                  \n"
    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
    "vsub.s32        q2, q6, q7                  \n"
    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
    "vsub.s32        q3, q9, q8                  \n"
    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3
    // set the results to output
    "vst1.16         d0[0], [%[out]], %[kStep]      \n"
    "vst1.16         d1[0], [%[out]], %[kStep]      \n"
    "vst1.16         d2[0], [%[out]], %[kStep]      \n"
    "vst1.16         d3[0], [%[out]], %[kStep]      \n"
    "vst1.16         d0[1], [%[out]], %[kStep]      \n"
    "vst1.16         d1[1], [%[out]], %[kStep]      \n"
    "vst1.16         d2[1], [%[out]], %[kStep]      \n"
    "vst1.16         d3[1], [%[out]], %[kStep]      \n"
    "vst1.16         d0[2], [%[out]], %[kStep]      \n"
    "vst1.16         d1[2], [%[out]], %[kStep]      \n"
    "vst1.16         d2[2], [%[out]], %[kStep]      \n"
    "vst1.16         d3[2], [%[out]], %[kStep]      \n"
    "vst1.16         d0[3], [%[out]], %[kStep]      \n"
    "vst1.16         d1[3], [%[out]], %[kStep]      \n"
    "vst1.16         d2[3], [%[out]], %[kStep]      \n"
    "vst1.16         d3[3], [%[out]], %[kStep]      \n"
    : [out] "+r"(out)  // modified registers
    : [in] "r"(in), [kStep] "r"(kStep)  // constants
    : "memory", "q0", "q1", "q2", "q3", "q4",
      "q5", "q6", "q7", "q8", "q9" // clobbered
  );
 }
 // Forward transform.
 #if defined(USE_INTRINSICS)
 // Transposes the 4x4 int16 matrix whose rows are A, B, C and D.
 // On return *out01 holds transposed rows 0 | 1 and *out32 holds transposed
 // rows 3 | 2 (note the swapped order in the second output).
 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
                                          const int16x4_t C, const int16x4_t D,
                                          int16x8_t* const out01,
                                          int16x8_t* const out32) {
  // 16-bit step: ab = {a0 b0 a2 b2, a1 b1 a3 b3}, likewise cd for C and D.
  const int16x4x2_t ab = vtrn_s16(A, B);
  const int16x4x2_t cd = vtrn_s16(C, D);
  // 32-bit step: 'even' yields transposed rows 0 and 2, 'odd' rows 1 and 3.
  const int32x2x2_t even = vtrn_s32(vreinterpret_s32_s16(ab.val[0]),
                                    vreinterpret_s32_s16(cd.val[0]));
  const int32x2x2_t odd = vtrn_s32(vreinterpret_s32_s16(ab.val[1]),
                                   vreinterpret_s32_s16(cd.val[1]));
  const int64x1_t row0 = vreinterpret_s64_s32(even.val[0]);
  const int64x1_t row1 = vreinterpret_s64_s32(odd.val[0]);
  const int64x1_t row2 = vreinterpret_s64_s32(even.val[1]);
  const int64x1_t row3 = vreinterpret_s64_s32(odd.val[1]);
  *out01 = vreinterpretq_s16_s64(vcombine_s64(row0, row1));
  *out32 = vreinterpretq_s16_s64(vcombine_s64(row3, row2));
 }
 // Returns a - b for each of the eight byte lanes, as signed 16-bit values.
 // The widening subtraction wraps modulo 2^16, so reinterpreting its result
 // as signed gives the true difference.
 static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
                                          const uint8x8_t b) {
  const uint16x8_t wrapped_diff = vsubl_u8(a, b);
  return vreinterpretq_s16_u16(wrapped_diff);
 }
 // Forward transform of the difference between two 4x4 blocks.
 // Loads a 4x4 block from 'src' and one from 'ref' (via Load4x4), computes
 // src - ref as 16-bit values, runs two transform passes with a transpose
 // before each, and stores the 16 resulting int16 coefficients to 'out'.
 static void FTransform(const uint8_t* src, const uint8_t* ref,
                        int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4(src);
    const uint8x16_t R0 = Load4x4(ref);
    // Per-pixel differences, two rows per vector.
    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // 1st pass
    // Rounding constants added before the '>> 9' below.
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);  // (a0 | a1) << 3
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    // a2 * 2217 + a3 * 5352
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    // a3 * 2217 - a2 * 5352
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // 2nd pass
    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    // (a0 + a1 + 7) >> 4 and (a0 - a1 + 7) >> 4
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    // vaddhn adds the constant and keeps the high 16 bits (i.e. '>> 16').
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    // vceq yields all-ones (-1) in the lanes where a3 == 0; adding it cancels
    // the +1 that the (1 << 16) term contributed to tmp1.
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
 }
 #else
 // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
 static const int16_t kCoeff16[] = {
  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
@ -322,7 +452,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
-    "vmvn.s16        d4, d4                   \n"
+    "vmvn            d4, d4                   \n" // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1!=0)
@ -341,93 +471,76 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
  );
 }
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+#endif
  const int kStep = 32;
  __asm__ volatile (
    // d0 = in[0 * 16] , d1 = in[1 * 16]
    // d2 = in[2 * 16] , d3 = in[3 * 16]
    "vld1.16         d0[0], [%[in]], %[kStep]   \n"
    "vld1.16         d1[0], [%[in]], %[kStep]   \n"
    "vld1.16         d2[0], [%[in]], %[kStep]   \n"
    "vld1.16         d3[0], [%[in]], %[kStep]   \n"
    "vld1.16         d0[1], [%[in]], %[kStep]   \n"
    "vld1.16         d1[1], [%[in]], %[kStep]   \n"
    "vld1.16         d2[1], [%[in]], %[kStep]   \n"
    "vld1.16         d3[1], [%[in]], %[kStep]   \n"
    "vld1.16         d0[2], [%[in]], %[kStep]   \n"
    "vld1.16         d1[2], [%[in]], %[kStep]   \n"
    "vld1.16         d2[2], [%[in]], %[kStep]   \n"
    "vld1.16         d3[2], [%[in]], %[kStep]   \n"
    "vld1.16         d0[3], [%[in]], %[kStep]   \n"
    "vld1.16         d1[3], [%[in]], %[kStep]   \n"
    "vld1.16         d2[3], [%[in]], %[kStep]   \n"
    "vld1.16         d3[3], [%[in]], %[kStep]   \n"
-    "vaddl.s16       q2, d0, d2                 \n"
+#define LOAD_LANE_16b(VALUE, LANE) do {             \
-    "vshl.s32        q2, q2, #2                 \n" // a0=(in[0*16]+in[2*16])<<2
+  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
-    "vaddl.s16       q3, d1, d3                 \n"
+  src += stride;                                    \
-    "vshl.s32        q3, q3, #2                 \n" // a1=(in[1*16]+in[3*16])<<2
+} while (0)
    "vsubl.s16       q4, d1, d3                 \n"
    "vshl.s32        q4, q4, #2                 \n" // a2=(in[1*16]-in[3*16])<<2
    "vsubl.s16       q5, d0, d2                 \n"
    "vshl.s32        q5, q5, #2                 \n" // a3=(in[0*16]-in[2*16])<<2
-    "vceq.s32        q10, q2, #0                \n"
+static void FTransformWHT(const int16_t* src, int16_t* out) {
-    "vmvn.s32        q10, q10                   \n" // (a0 != 0)
+  const int stride = 16;
-    "vqadd.s32       q6, q2, q3                 \n" // (a0 + a1)
+  const int16x4_t zero = vdup_n_s16(0);
-    "vqsub.s32       q6, q6, q10                \n" // (a0 + a1) + (a0 != 0)
+  int32x4x4_t tmp0;
-    "vqadd.s32       q7, q5, q4                 \n" // a3 + a2
+  int16x4x4_t in;
-    "vqsub.s32       q8, q5, q4                 \n" // a3 - a2
+  INIT_VECTOR4(in, zero, zero, zero, zero);
-    "vqsub.s32       q9, q2, q3                 \n" // a0 - a1
+  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);
-    // Transpose
+  {
-    // q6 = tmp[0, 1,  2,  3] ; q7 = tmp[ 4,  5,  6,  7]
+    // a0 = in[0 * 16] + in[2 * 16]
-    // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15]
+    // a1 = in[1 * 16] + in[3 * 16]
-    "vswp            d13, d16                   \n" // vtrn.64 q0, q2
+    // a2 = in[1 * 16] - in[3 * 16]
-    "vswp            d15, d18                   \n" // vtrn.64 q1, q3
+    // a3 = in[0 * 16] - in[2 * 16]
-    "vtrn.32         q6, q7                     \n"
+    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
-    "vtrn.32         q8, q9                     \n"
+    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
-
+    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
-    "vqadd.s32       q0, q6, q8                 \n" // a0 = tmp[0] + tmp[8]
+    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
-    "vqadd.s32       q1, q7, q9                 \n" // a1 = tmp[4] + tmp[12]
+    tmp0.val[0] = vaddq_s32(a0, a1);
-    "vqsub.s32       q2, q7, q9                 \n" // a2 = tmp[4] - tmp[12]
+    tmp0.val[1] = vaddq_s32(a3, a2);
-    "vqsub.s32       q3, q6, q8                 \n" // a3 = tmp[0] - tmp[8]
+    tmp0.val[2] = vsubq_s32(a3, a2);
-
+    tmp0.val[3] = vsubq_s32(a0, a1);
    "vqadd.s32       q4, q0, q1                 \n" // b0 = a0 + a1
    "vqadd.s32       q5, q3, q2                 \n" // b1 = a3 + a2
    "vqsub.s32       q6, q3, q2                 \n" // b2 = a3 - a2
    "vqsub.s32       q7, q0, q1                 \n" // b3 = a0 - a1
    "vmov.s32         q0, #3                    \n" // q0 = 3
    "vcgt.s32        q1, q4, #0                 \n" // (b0>0)
    "vqsub.s32       q2, q4, q1                 \n" // (b0+(b0>0))
    "vqadd.s32       q3, q2, q0                 \n" // (b0+(b0>0)+3)
    "vshrn.s32       d18, q3, #3                \n" // (b0+(b0>0)+3) >> 3
    "vcgt.s32        q1, q5, #0                 \n" // (b1>0)
    "vqsub.s32       q2, q5, q1                 \n" // (b1+(b1>0))
    "vqadd.s32       q3, q2, q0                 \n" // (b1+(b1>0)+3)
    "vshrn.s32       d19, q3, #3                \n" // (b1+(b1>0)+3) >> 3
    "vcgt.s32        q1, q6, #0                 \n" // (b2>0)
    "vqsub.s32       q2, q6, q1                 \n" // (b2+(b2>0))
    "vqadd.s32       q3, q2, q0                 \n" // (b2+(b2>0)+3)
    "vshrn.s32       d20, q3, #3                \n" // (b2+(b2>0)+3) >> 3
    "vcgt.s32        q1, q7, #0                 \n" // (b3>0)
    "vqsub.s32       q2, q7, q1                 \n" // (b3+(b3>0))
    "vqadd.s32       q3, q2, q0                 \n" // (b3+(b3>0)+3)
    "vshrn.s32       d21, q3, #3                \n" // (b3+(b3>0)+3) >> 3
    "vst1.16         {q9, q10}, [%[out]]        \n"
    : [in] "+r"(in)
    : [kStep] "r"(kStep), [out] "r"(out)
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5",
      "q6", "q7", "q8", "q9", "q10"       // clobbered
  ) ;
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
 }
 #undef LOAD_LANE_16b
 //------------------------------------------------------------------------------
 // Texture distortion
@ -435,9 +548,136 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.
 // This code works but is *slower* than the inlined-asm version below
 // (with gcc-4.6), so it is disabled unless USE_INTRINSICS is defined.
 // With gcc-4.8, it's only slightly slower than the inlined-asm version.
 #if defined(USE_INTRINSICS)
 // Widens the four unsigned 16-bit lanes of 'v' to 32 bits (zero extension) and
 // returns them reinterpreted as signed 32-bit lanes.
 static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
  const uint32x4_t widened = vmovl_u16(v);
  return vreinterpretq_s32_u32(widened);
 }
 // Transposes the 4x4 block 'rows', then exchanges the upper halves of the two
 // inner rows so that the columns holding differences come back in source
 // order, i.e., a0 - a1 | a3 - a2.
 static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) {
  int32x4x4_t transposed = Transpose4x4(rows);
  const int32x4_t row1 = transposed.val[1];
  const int32x4_t row2 = transposed.val[2];
  // Each inner row keeps its own low half and takes the other's high half.
  transposed.val[1] = vcombine_s32(vget_low_s32(row1), vget_high_s32(row2));
  transposed.val[2] = vcombine_s32(vget_low_s32(row2), vget_high_s32(row1));
  return transposed;
 }
 // Horizontal pass of the transform for one 4x4 block.
 // 'r0r1' and 'r2r3' are two 8-byte vectors whose lanes are added/subtracted
 // element-wise (in[0] with in[2], in[1] with in[3]). Returns four 32-bit
 // vectors holding, per the comments below, a0 + a1, a3 + a2, tmp[2] (a3 - a2)
 // and tmp[3] (a0 - a1).
 // NOTE(review): the lane order of the two difference vectors follows the
 // vtrnq shuffles below; DistoTranspose4x4() appears to be what restores the
 // source order afterwards -- confirm before reusing this helper elsewhere.
 static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1,
                                                    const uint8x8_t r2r3) {
  // a0 = in[0] + in[2] | a1 = in[1] + in[3]
  const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3);
  // a3 = in[0] - in[2] | a2 = in[1] - in[3]
  // (the unsigned widening subtract wraps; the s16 reinterpretation below
  // reads the lanes back as signed differences)
  const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3);
  const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1));  // a0 + a1
  const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2));  // a3 + a2
  // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations.
  // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3
  // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2
  const int16x8x2_t transpose =
      vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2));
  // tmp[3] = a0 - a1 | tmp[2] = a3 - a2
  const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]),
                                      vget_low_s16(transpose.val[1]));
  const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]),
                                      vget_high_s16(transpose.val[1]));
  // [0]: tmp[3] [1]: tmp[2]
  const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2);
  const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } };
  return res;
 }
 // Vertical pass of the transform: combines the four vectors of 'rows' with the
 // add/subtract butterfly below and returns the absolute values of the four
 // resulting vectors.
 // NOTE(review): the tmp[] offsets quoted in the comments pair 0/8 and 4/12,
 // while the code pairs val[0]/val[1] and val[2]/val[3]; presumably the vectors
 // arrive in that order from DistoHorizontalPass() + DistoTranspose4x4() --
 // confirm.
 static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) {
  // a0 = tmp[0 + i] + tmp[8 + i];
  const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]);
  // a1 = tmp[4 + i] + tmp[12+ i];
  const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]);
  // a2 = tmp[4 + i] - tmp[12+ i];
  const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]);
  // a3 = tmp[0 + i] - tmp[8 + i];
  const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]);
  const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1));  // abs(a0 + a1)
  const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2));  // abs(a3 + a2)
  // absolute-difference intrinsic: abs(x - y) in a single operation
  const int32x4_t b2 = vabdq_s32(a3, a2);              // abs(a3 - a2)
  const int32x4_t b3 = vabdq_s32(a0, a1);              // abs(a0 - a1)
  const int32x4x4_t res = { { b0, b1, b2, b3 } };
  return res;
 }
 // Returns the sum over all lanes of
 //   w0 * b.val[0] + w1 * b.val[1] + w2 * b.val[2] + w3 * b.val[3].
 // The per-lane products are accumulated in 32 bits and only the final
 // horizontal reduction is widened to 64 bits.
 static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b,
                                       const int32x4_t w0, const int32x4_t w1,
                                       const int32x4_t w2, const int32x4_t w3) {
  int32x4_t acc = vmulq_s32(w0, b.val[0]);
  int64x2_t pairs;
  acc = vmlaq_s32(acc, w1, b.val[1]);
  acc = vmlaq_s32(acc, w2, b.val[2]);
  acc = vmlaq_s32(acc, w3, b.val[3]);
  pairs = vpaddlq_s32(acc);  // 4 x 32b -> 2 x 64b
  return vadd_s64(vget_low_s64(pairs), vget_high_s64(pairs));
 }
 // Loads 32 bits from 'src' into lane LANE of the uint32x4_t VALUE, leaving the
 // other lanes untouched. 'src' is read through a uint32_t pointer.
 #define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
 // Hadamard transform
 // Runs the horizontal pass, transpose and vertical pass on the 4x4 blocks at
 // 'a' and 'b' (rows are BPS bytes apart), computes for each block the sum of
 // the absolute transformed coefficients weighted by the 16 entries of 'w',
 // and returns the absolute difference of the two sums shifted right by 5.
 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                     const uint16_t* const w) {
  uint32x4_t d0d1 = { 0, 0, 0, 0 };
  uint32x4_t d2d3 = { 0, 0, 0, 0 };
  // Rows 0/1 of both blocks go to d0d1, rows 2/3 to d2d3: 'a' fills the low
  // half of each vector and 'b' the high half.
  LOAD_LANE_32b(a + 0 * BPS, d0d1, 0);  // a00 a01 a02 a03
  LOAD_LANE_32b(a + 1 * BPS, d0d1, 1);  // a10 a11 a12 a13
  LOAD_LANE_32b(b + 0 * BPS, d0d1, 2);  // b00 b01 b02 b03
  LOAD_LANE_32b(b + 1 * BPS, d0d1, 3);  // b10 b11 b12 b13
  LOAD_LANE_32b(a + 2 * BPS, d2d3, 0);  // a20 a21 a22 a23
  LOAD_LANE_32b(a + 3 * BPS, d2d3, 1);  // a30 a31 a32 a33
  LOAD_LANE_32b(b + 2 * BPS, d2d3, 2);  // b20 b21 b22 b23
  LOAD_LANE_32b(b + 3 * BPS, d2d3, 3);  // b30 b31 b32 b33
  {
    // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31
    // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33
    const uint16x8x2_t tmp =
        vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3));
    const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]);
    const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]);
    // The low 8 bytes belong to block 'a', the high 8 bytes to block 'b'.
    const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8),
                                                    vget_low_u8(d2d3u8));
    const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8),
                                                    vget_high_u8(d2d3u8));
    const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a);
    const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b);
    const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a);
    const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b);
    // The 16 weights, four per output row of the vertical pass.
    const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0));
    const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4));
    const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8));
    const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12));
    const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3);
    const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3);
    // Only lane 0 (the low 32 bits of each 64-bit sum) is used for the result.
    const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1),
                                    vreinterpret_s32_s64(sum2));
    const int32x2_t res = vshr_n_s32(diff, 5);
    return vget_lane_s32(res, 0);
  }
 }
 #undef LOAD_LANE_32b
 #else
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // This uses a TTransform helper function in C
 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int kBPS = BPS;
@ -515,7 +755,7 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
    // q12/14 tmp[12-15]
    // These are still in 01 45 23 67 order. We fix it easily in the addition
-    // case but the subtraction propegates them.
+    // case but the subtraction propagates them.
    "vswp            d3, d27                  \n"
    "vswp            d19, d31                 \n"
@ -624,6 +864,8 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
  return sum;
 }
 #endif  // USE_INTRINSICS
 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int D = 0;
@ -636,6 +878,179 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
  return D;
 }
 //------------------------------------------------------------------------------
 // For every block index j in [start_block, end_block): runs FTransform() on
 // the ref/pred pair located at offset VP8DspScan[j], maps each of the 16
 // coefficients to the bin min(abs(coeff) >> 3, MAX_COEFF_THRESH) and
 // increments the matching entry of histo->distribution[].
 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                              int start_block, int end_block,
                              VP8Histogram* const histo) {
  const uint16x8_t bin_limit = vdupq_n_u16(MAX_COEFF_THRESH);
  int block;
  for (block = start_block; block < end_block; ++block) {
    int16_t coeffs[16];
    int i;
    FTransform(ref + VP8DspScan[block], pred + VP8DspScan[block], coeffs);
    {
      // Compute the bins eight lanes at a time, overwriting 'coeffs'.
      const uint16x8_t abs_lo =
          vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(coeffs + 0)));
      const uint16x8_t abs_hi =
          vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(coeffs + 8)));
      const uint16x8_t bins_lo = vminq_u16(vshrq_n_u16(abs_lo, 3), bin_limit);
      const uint16x8_t bins_hi = vminq_u16(vshrq_n_u16(abs_hi, 3), bin_limit);
      vst1q_s16(coeffs + 0, vreinterpretq_s16_u16(bins_lo));
      vst1q_s16(coeffs + 8, vreinterpretq_s16_u16(bins_hi));
    }
    // Convert coefficients to bin.
    for (i = 0; i < 16; ++i) {
      ++histo->distribution[coeffs[i]];
    }
  }
 }
 //------------------------------------------------------------------------------
 // Adds the sum of squared differences between the 16 bytes at 'a' and the 16
 // bytes at 'b' to the four 32-bit partial sums in '*sum'.
 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
                                         const uint8_t* const b,
                                         uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  // A single square can reach 255 * 255 = 65025, so adding two of them in a
  // 16-bit lane (as a multiply-accumulate would) can wrap around. Keep the
  // low/high products separate and only add them once widened to 32 bits.
  const uint16x8_t prod_lo = vmull_u8(vget_low_u8(abs_diff),
                                      vget_low_u8(abs_diff));
  const uint16x8_t prod_hi = vmull_u8(vget_high_u8(abs_diff),
                                      vget_high_u8(abs_diff));
  *sum = vpadalq_u16(*sum, prod_lo);   // pair-wise add, widen and accumulate
  *sum = vpadalq_u16(*sum, prod_hi);
 }
 // Adds the four 32-bit lanes of 'sum' together (in 64-bit precision) and
 // returns the total converted to int.
 static int SumToInt(uint32x4_t sum) {
  const uint64x2_t pairs = vpaddlq_u32(sum);
  const uint64_t low = vgetq_lane_u64(pairs, 0);
  const uint64_t high = vgetq_lane_u64(pairs, 1);
  return (int)(low + high);
 }
 // Sum of squared differences between two 16x16 blocks whose rows are BPS
 // bytes apart.
 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  uint32x4_t acc = vdupq_n_u32(0);
  int offset;
  for (offset = 0; offset < 16 * BPS; offset += BPS) {
    AccumulateSSE16(a + offset, b + offset, &acc);
  }
  return SumToInt(acc);
 }
 // Sum of squared differences between two blocks of 8 rows of 16 pixels, the
 // rows being BPS bytes apart.
 static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  uint32x4_t acc = vdupq_n_u32(0);
  int offset;
  for (offset = 0; offset < 8 * BPS; offset += BPS) {
    AccumulateSSE16(a + offset, b + offset, &acc);
  }
  return SumToInt(acc);
 }
 // Sum of squared differences between two 8x8 blocks whose rows are BPS bytes
 // apart.
 static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  uint32x4_t acc = vdupq_n_u32(0);
  int offset;
  for (offset = 0; offset < 8 * BPS; offset += BPS) {
    const uint8x8_t abs_diff = vabd_u8(vld1_u8(a + offset),
                                       vld1_u8(b + offset));
    // One square per 16-bit lane, widened to 32 bits while accumulating.
    acc = vpadalq_u16(acc, vmull_u8(abs_diff, abs_diff));
  }
  return SumToInt(acc);
 }
 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const uint8x16_t a0 = Load4x4(a);
  const uint8x16_t b0 = Load4x4(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
  prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
  return SumToInt(vpaddlq_u16(prod));
 }
 //------------------------------------------------------------------------------
 // Compilation with gcc-4.6.x is problematic for now.
 #if !defined(WORK_AROUND_GCC)
 // Quantizes the eight coefficients in[offset .. offset + 7] with the matching
 // entries of 'mtx' (sharpen_, iq_, bias_, q_).
 // The signed levels, clamped to MAX_LEVEL in magnitude, are returned; the
 // input coefficients are overwritten with level * q.
 // Only valid for QFIX == 17 (see the assert at the end).
 static int16x8_t Quantize(int16_t* const in,
                           const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  // The halving add provides the first bit of the QFIX shift; the narrowing
  // shift by 16 below provides the rest.
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  // (x ^ sign) - sign negates x where 'sign' is all ones, keeps it otherwise.
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
 }
 // Byte-level table-lookup indices used by QuantizeBlock() to reorder the 16
 // quantized levels. Each consecutive pair (2k, 2k + 1) selects the two bytes
 // of 16-bit level k from the 32 bytes formed by the two Quantize() results;
 // row r of the table produces output levels 4r .. 4r + 3.
 static const uint8_t kShuffles[4][8] = {
  { 0,   1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
 };
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize(in, mtx, 0);
  const int16x8_t out1 = Quantize(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
 #if defined(__APPLE__) && defined(__aarch64__) && \
    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
 #else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
 #endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out +  0) != 0) return 1;
  if (*(uint64_t*)(out +  4) != 0) return 1;
  if (*(uint64_t*)(out +  8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
 }
 #endif   // !WORK_AROUND_GCC
 #endif   // WEBP_USE_NEON
 //------------------------------------------------------------------------------
@ -648,14 +1063,17 @@ void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
  VP8ITransformWHT = ITransformWHT;
  VP8FTransformWHT = FTransformWHT;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8CollectHistogram = CollectHistogram;
  VP8SSE16x16 = SSE16x16;
  VP8SSE16x8 = SSE16x8;
  VP8SSE8x8 = SSE8x8;
  VP8SSE4x4 = SSE4x4;
 #if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock;
 #endif
 #endif   // WEBP_USE_NEON
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of speed-critical encoding functions.
@ -11,15 +13,13 @@
 #include "./dsp.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #if defined(WEBP_USE_SSE2)
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>
 #include "../enc/cost.h"
 #include "../enc/vp8enci.h"
 #include "../utils/utils.h"
 //------------------------------------------------------------------------------
 // Quite useful macro for debugging. Left here for convenience.
@ -54,7 +54,7 @@ static void PrintReg(const __m128i r, const char* const name, int size) {
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
-static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
@ -100,7 +100,7 @@ static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
 // Transforms (Paragraph 14.4)
 // Does one or two inverse transforms.
-static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                       int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
@ -320,8 +320,7 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
  }
 }
-static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k937 = _mm_set1_epi32(937);
@ -446,17 +445,50 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
    // -> f1 = f1 + 1 - (a3 == 0)
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
-    _mm_storel_epi64((__m128i*)&out[ 0], d0);
+    const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
-    _mm_storel_epi64((__m128i*)&out[ 4], g1);
+    const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
-    _mm_storel_epi64((__m128i*)&out[ 8], d2);
+    _mm_storeu_si128((__m128i*)&out[0], d0_g1);
-    _mm_storel_epi64((__m128i*)&out[12], f3);
+    _mm_storeu_si128((__m128i*)&out[8], d2_f3);
  }
 }
 // Forward Walsh-Hadamard transform.
 // Reads the 16 values in[0], in[16], ..., in[15 * 16] (one every 16 entries)
 // and writes the 16 transformed coefficients to 'out'. The first pass is done
 // in scalar code, the second one with SSE2; the results are halved and packed
 // to 16 bits with signed saturation.
 static void FTransformWHT(const int16_t* in, int16_t* out) {
  int32_t rows[16];
  int r;
  for (r = 0; r < 4; ++r) {
    const int16_t* const src = in + r * 64;
    int32_t* const dst = rows + r * 4;
    const int sum02 = src[0 * 16] + src[2 * 16];
    const int sum13 = src[1 * 16] + src[3 * 16];
    const int diff13 = src[1 * 16] - src[3 * 16];
    const int diff02 = src[0 * 16] - src[2 * 16];
    dst[0] = sum02 + sum13;
    dst[1] = diff02 + diff13;
    dst[2] = diff02 - diff13;
    dst[3] = sum02 - sum13;
  }
  {
    const __m128i row0 = _mm_loadu_si128((__m128i*)&rows[0]);
    const __m128i row1 = _mm_loadu_si128((__m128i*)&rows[4]);
    const __m128i row2 = _mm_loadu_si128((__m128i*)&rows[8]);
    const __m128i row3 = _mm_loadu_si128((__m128i*)&rows[12]);
    const __m128i sum02 = _mm_add_epi32(row0, row2);
    const __m128i sum13 = _mm_add_epi32(row1, row3);
    const __m128i diff13 = _mm_sub_epi32(row1, row3);
    const __m128i diff02 = _mm_sub_epi32(row0, row2);
    // Butterfly followed by an arithmetic (sign-preserving) halving.
    const __m128i res0 = _mm_srai_epi32(_mm_add_epi32(sum02, sum13), 1);
    const __m128i res1 = _mm_srai_epi32(_mm_add_epi32(diff02, diff13), 1);
    const __m128i res2 = _mm_srai_epi32(_mm_sub_epi32(diff02, diff13), 1);
    const __m128i res3 = _mm_srai_epi32(_mm_sub_epi32(sum02, sum13), 1);
    _mm_storeu_si128((__m128i*)&out[0], _mm_packs_epi32(res0, res1));
    _mm_storeu_si128((__m128i*)&out[8], _mm_packs_epi32(res2, res3));
  }
 }
 //------------------------------------------------------------------------------
 // Metric
-static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
+static int SSE_Nx4(const uint8_t* a, const uint8_t* b,
                   int num_quads, int do_16) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum1 = zero;
@ -534,19 +566,19 @@ static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
  }
 }
-static int SSE16x16SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
-  return SSE_Nx4SSE2(a, b, 4, 1);
+  return SSE_Nx4(a, b, 4, 1);
 }
-static int SSE16x8SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
-  return SSE_Nx4SSE2(a, b, 2, 1);
+  return SSE_Nx4(a, b, 2, 1);
 }
-static int SSE8x8SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
-  return SSE_Nx4SSE2(a, b, 2, 0);
+  return SSE_Nx4(a, b, 2, 0);
 }
-static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  // Load values. Note that we read 8 pixels instead of 4,
@ -603,13 +635,13 @@ static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
 // Hadamard transform
 // Returns the difference between the weighted sum of the absolute value of
 // transformed coefficients.
-static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
+static int TTransform(const uint8_t* inA, const uint8_t* inB,
                      const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
-  // Load, combine and tranpose inputs.
+  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
@ -751,19 +783,19 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
  return sum[0] + sum[1] + sum[2] + sum[3];
 }
-static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
-  const int diff_sum = TTransformSSE2(a, b, w);
+  const int diff_sum = TTransform(a, b, w);
  return abs(diff_sum) >> 5;
 }
-static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4SSE2(a + x + y, b + x + y, w);
+      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
@ -773,9 +805,9 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 //
-// Simple quantization
+static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
-static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
+                                       const uint16_t* const sharpen,
-                             int n, const VP8Matrix* const mtx) {
+                                       const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
@ -787,20 +819,14 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);
-  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
+  // extract sign(in)  (0x0000 if positive, 0xffff if negative)
-  const __m128i sign0 = _mm_srai_epi16(in0, 15);
+  const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
-  const __m128i sign8 = _mm_srai_epi16(in8, 15);
+  const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
@ -809,32 +835,35 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  coeff8 = _mm_sub_epi16(coeff8, sign8);
  // coeff = abs(in) + sharpen
  if (sharpen != NULL) {
    const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&sharpen[0]);
    const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&sharpen[8]);
    coeff0 = _mm_add_epi16(coeff0, sharpen0);
    coeff8 = _mm_add_epi16(coeff8, sharpen8);
  }
-  // out = (coeff * iQ + B) >> QFIX;
+  // out = (coeff * iQ + B) >> QFIX
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
-    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
-    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
-    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
-    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    const __m128i bias_00 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
    const __m128i bias_04 = _mm_loadu_si128((__m128i*)&mtx->bias_[4]);
    const __m128i bias_08 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
    const __m128i bias_12 = _mm_loadu_si128((__m128i*)&mtx->bias_[12]);
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
-    // out = (coeff * iQ + B) >> QFIX;
+    // out = QUANTDIV(coeff, iQ, B, QFIX)
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
@ -859,17 +888,8 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);
  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
  _mm_storeu_si128((__m128i*)&in[0], in0);
  _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }
  // zigzag the output before storing it.
  //
@ -896,14 +916,44 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  }
  // detect if all 'out' values are zeroes or not
-  {
+  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
 }
-    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
+
 // Quantizes one block with DoQuantizeBlock(), passing the sharpening table
 // stored in 'mtx'.
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                          const VP8Matrix* const mtx) {
  const uint16_t* const sharpen = mtx->sharpen_;
  return DoQuantizeBlock(in, out, sharpen, mtx);
 }
 // Quantizes one block with DoQuantizeBlock() without any sharpening table
 // (a NULL pointer is passed instead).
 static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
                             const VP8Matrix* const mtx) {
  const uint16_t* const no_sharpen = NULL;
  return DoQuantizeBlock(in, out, no_sharpen, mtx);
 }
 // Forward declaration.
 void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
                               VP8Residual* const res);
 // Points 'res' at the 16 coefficients in 'coeffs' and sets res->last to the
 // index of the last non-zero coefficient, or -1 if they are all zero.
 void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
                               VP8Residual* const res) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i lo = _mm_loadu_si128((const __m128i*)coeffs);
  const __m128i hi = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Compare 8 values per instruction, then collect one bit per *byte* of each
  // comparison result: every 16-bit coefficient contributes two identical
  // bits, set when the coefficient is zero.
  const uint32_t zero_bits_lo =
      (uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi16(lo, zero));
  const uint32_t zero_bits_hi =
      (uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi16(hi, zero));
  // Concatenate both halves and negate, so that set bits mark the non-zero
  // entries. There is no need to mask out the least significant bits
  // according to res->first, since coeffs[0] is 0 if res->first > 0.
  const uint32_t nonzero_bits = ~((zero_bits_hi << 16) | zero_bits_lo);
  assert(res->first == 0 || coeffs[0] == 0);
  // The most significant set bit gives the last non-zero value; halve its
  // position because _mm_movemask_epi8 works on 8-bit rather than 16-bit
  // lanes.
  if (nonzero_bits != 0) {
    res->last = BitsLog2Floor(nonzero_bits) >> 1;
  } else {
    res->last = -1;
  }
  res->coeffs = coeffs;
 }
 #endif   // WEBP_USE_SSE2
@ -915,19 +965,18 @@ extern void VP8EncDspInitSSE2(void);
 void VP8EncDspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
-  VP8CollectHistogram = CollectHistogramSSE2;
+  VP8CollectHistogram = CollectHistogram;
-  VP8EncQuantizeBlock = QuantizeBlockSSE2;
+  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8ITransform = ITransformSSE2;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
-  VP8FTransform = FTransformSSE2;
+  VP8ITransform = ITransform;
-  VP8SSE16x16 = SSE16x16SSE2;
+  VP8FTransform = FTransform;
-  VP8SSE16x8 = SSE16x8SSE2;
+  VP8FTransformWHT = FTransformWHT;
-  VP8SSE8x8 = SSE8x8SSE2;
+  VP8SSE16x16 = SSE16x16;
-  VP8SSE4x4 = SSE4x4SSE2;
+  VP8SSE16x8 = SSE16x8;
-  VP8TDisto4x4 = Disto4x4SSE2;
+  VP8SSE8x8 = SSE8x8;
-  VP8TDisto16x16 = Disto16x16SSE2;
+  VP8SSE4x4 = SSE4x4;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
 #endif   // WEBP_USE_SSE2
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Image transforms and color space conversion methods for lossless decoder.
@ -16,10 +18,62 @@
 #include "../webp/types.h"
 #include "../webp/decode.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#include "../enc/histogram.h"
 #include "../utils/utils.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // Signatures and generic function-pointers
 // Predictor signature: takes a 'left' value and a pointer into the 'top' row
 // and returns a 32-bit value. VP8LPredictors[] holds 16 such functions.
 typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
 // Signature shared by the two green add/subtract function pointers below:
 // operates on 'num_pixels' entries of the non-const buffer 'argb_data'.
 typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
 extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
 // Multipliers handed to the VP8LTransformColorFunc implementations.
 typedef struct {
  // Note: the members are uint8_t, so that any negative values are
  // automatically converted to "mod 256" values.
  uint8_t green_to_red_;
  uint8_t green_to_blue_;
  uint8_t red_to_blue_;
 } VP8LMultipliers;
 // Signature of the forward and inverse color transforms: multipliers 'm' are
 // read-only, 'argb_data' is a non-const buffer of 'num_pixels' entries.
 typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
                                       uint32_t* argb_data, int num_pixels);
 extern VP8LTransformColorFunc VP8LTransformColor;
 extern VP8LTransformColorFunc VP8LTransformColorInverse;
 // Signature of the color-space converters: reads 'num_pixels' 32-bit values
 // from 'src' and writes bytes to 'dst'. The output layout is the one named by
 // each function pointer.
 typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
                                uint8_t* dst);
 extern VP8LConvertFunc VP8LConvertBGRAToRGB;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
 extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
 extern VP8LConvertFunc VP8LConvertBGRAToBGR;
 // Expose some C-only fallback functions
 void VP8LTransformColor_C(const VP8LMultipliers* const m,
                          uint32_t* data, int num_pixels);
 void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
                                 uint32_t* data, int num_pixels);
 void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
                                 int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
                               int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
 void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
 void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
 // Must be called before calling any of the above methods.
 void VP8LDspInit(void);
 //------------------------------------------------------------------------------
 // Image transforms.
@ -33,14 +87,18 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform,
                          int row_start, int row_end,
                          const uint32_t* const in, uint32_t* const out);
-// Subtracts green from blue and red channels.
+// Similar to the static method ColorIndexInverseTransform() that is part of
-void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs);
+// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
 // uint32_t) arguments for 'src' and 'dst'.
 void VP8LColorIndexInverseTransformAlpha(
    const struct VP8LTransform* const transform, int y_start, int y_end,
    const uint8_t* src, uint8_t* dst);
 void VP8LResidualImage(int width, int height, int bits,
                       uint32_t* const argb, uint32_t* const argb_scratch,
                       uint32_t* const image);
-void VP8LColorSpaceTransform(int width, int height, int bits, int step,
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
                             uint32_t* const argb, uint32_t* image);
 //------------------------------------------------------------------------------
@ -59,20 +117,116 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
  return (size + (1 << sampling_bits) - 1) >> sampling_bits;
 }
 // -----------------------------------------------------------------------------
 // Faster logarithm for integers. Small values use a look-up table.
 #define LOG_LOOKUP_IDX_MAX 256
 extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
 extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-extern float VP8LFastLog2Slow(int v);
+typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
-extern float VP8LFastSLog2Slow(int v);
+
-static WEBP_INLINE float VP8LFastLog2(int v) {
+extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
 extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
 // Fast log2 for integer input: values below LOG_LOOKUP_IDX_MAX are read from
 // kLog2Table, everything else goes through the VP8LFastLog2Slow pointer.
 static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
  if (v >= LOG_LOOKUP_IDX_MAX) {
    return VP8LFastLog2Slow(v);
  }
  return kLog2Table[v];
 }
 // Fast calculation of v * log2(v) for integer input.
-static WEBP_INLINE float VP8LFastSLog2(int v) {
+static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
  return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
 }
 // -----------------------------------------------------------------------------
 // Huffman-cost related functions.
 // Cost callbacks: each returns a double computed from 'length' histogram
 // entries, either from one array or from two arrays X and Y taken together.
 typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
 typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
                                       int length);
 extern VP8LCostFunc VP8LExtraCost;
 extern VP8LCostCombinedFunc VP8LExtraCostCombined;
 typedef struct {        // small struct to hold counters
  int counts[2];        // index: 0=zero streak, 1=non-zero streak
  int streaks[2][2];    // [zero/non-zero][streak<3 / streak>=3]
 } VP8LStreaks;
 // Streak-counting callbacks: same inputs as the cost callbacks above, but the
 // result is returned by value as a VP8LStreaks.
 typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population,
                                         int length);
 typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
                                                 const uint32_t* Y, int length);
 extern VP8LCostCountFunc VP8LHuffmanCostCount;
 extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
 // Histogram addition callback: reads 'a' and 'b', writes 'out'.
 typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a,
                                     const VP8LHistogram* const b,
                                     VP8LHistogram* const out);
 extern VP8LHistogramAddFunc VP8LHistogramAdd;
 // -----------------------------------------------------------------------------
 // PrefixEncode()
 // Returns BitsLog2Floor(n) when 'n' has at most one bit set (zero or a power
 // of two), and BitsLog2Floor(n) + 1 otherwise.
 static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
  // Clearing the lowest set bit leaves zero only for 0 and powers of two.
  const int round_up = ((n & (n - 1)) != 0);
  return BitsLog2Floor(n) + round_up;
 }
 // Distance and length codes are split into a prefix and extra bits. The
 // prefix is entropy coded; the extra bits are stored as plain bits.
 // This variant computes the prefix code and the number of extra bits for
 // 'distance' directly, without the look-up tables.
 static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
                                                  int* const extra_bits) {
  const int value = distance - 1;
  const int top_bit = BitsLog2Floor(value);
  const int num_extra = top_bit - 1;
  // Bit just below the highest set bit of 'value'.
  const int next_bit = (value >> num_extra) & 1;
  *extra_bits = num_extra;
  *code = 2 * top_bit + next_bit;
 }
 // Table-free prefix encoding of 'distance': outputs the prefix code, the
 // number of extra bits and the value carried by those extra bits.
 static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
                                              int* const extra_bits,
                                              int* const extra_bits_value) {
  const int value = distance - 1;
  const int top_bit = BitsLog2Floor(value);
  const int num_extra = top_bit - 1;
  // Bit just below the highest set bit of 'value'.
  const int next_bit = (value >> num_extra) & 1;
  *extra_bits = num_extra;
  // Low 'num_extra' bits of 'value'.
  *extra_bits_value = value & ((1 << num_extra) - 1);
  *code = 2 * top_bit + next_bit;
 }
 // Number of entries in the prefix look-up tables below; distances at or above
 // this value are handled by the *NoLUT functions.
 #define PREFIX_LOOKUP_IDX_MAX   512
 // One look-up entry: prefix code and number of extra bits.
 typedef struct {
  int8_t code_;
  int8_t extra_bits_;
 } VP8LPrefixCode;
 // These tables are derived using VP8LPrefixEncodeNoLUT.
 extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
 extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
 // Computes the prefix code and the number of extra bits for 'distance'.
 // Distances below PREFIX_LOOKUP_IDX_MAX are read from kPrefixEncodeCode[];
 // larger ones are computed by VP8LPrefixEncodeBitsNoLUT().
 static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
                                             int* const extra_bits) {
  if (distance >= PREFIX_LOOKUP_IDX_MAX) {
    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
    return;
  }
  {
    const VP8LPrefixCode entry = kPrefixEncodeCode[distance];
    *code = entry.code_;
    *extra_bits = entry.extra_bits_;
  }
 }
 // Computes the prefix code, the number of extra bits and the extra-bits
 // value for 'distance'. Distances below PREFIX_LOOKUP_IDX_MAX come from the
 // look-up tables; larger ones are computed by VP8LPrefixEncodeNoLUT().
 static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
                                         int* const extra_bits,
                                         int* const extra_bits_value) {
  if (distance >= PREFIX_LOOKUP_IDX_MAX) {
    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
    return;
  }
  {
    const VP8LPrefixCode entry = kPrefixEncodeCode[distance];
    *code = entry.code_;
    *extra_bits = entry.extra_bits_;
    *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
  }
 }
 // In-place difference of each component with mod 256.
 static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
@ -83,9 +237,12 @@ static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
 }
 void VP8LBundleColorMap(const uint8_t* const row, int width,
                        int xbits, uint32_t* const dst);
 //------------------------------------------------------------------------------
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/src/dsp/lossless_mips32.c
+++ b/src/dsp/lossless_mips32.c
@ -0,0 +1,416 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // MIPS version of lossless functions
 //
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 #include "./dsp.h"
 #include "./lossless.h"
 #if defined(WEBP_USE_MIPS32)
 #include <assert.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #define APPROX_LOG_WITH_CORRECTION_MAX  65536
 #define APPROX_LOG_MAX                   4096
 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
 // MIPS32 slow path for v * log2(v), for inputs v >= LOG_LOOKUP_IDX_MAX.
 // Below APPROX_LOG_WITH_CORRECTION_MAX the result is built from kLog2Table
 // plus a linear correction term; larger inputs are computed with log().
 static float FastSLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;
    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    // The asm computes:
    //   log_cnt = 24 - clz(v);  y = 1 << log_cnt;  temp = v >> log_cnt;
    __asm__ volatile(
      "clz      %[log_cnt], %[v]                      \n\t"
      "addiu    %[y],       $zero,        1           \n\t"
      "subu     %[log_cnt], %[c24],       %[log_cnt]  \n\t"
      "sllv     %[y],       %[y],         %[log_cnt]  \n\t"
      "srlv     %[temp],    %[v],         %[log_cnt]  \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );
    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16
    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = (23 * (v & (y - 1))) >> 4;
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    // Exact path: v * ln(v) / ln(2).
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
 }
 // MIPS32 slow path for log2(v), for inputs v >= LOG_LOOKUP_IDX_MAX.
 // Below APPROX_LOG_WITH_CORRECTION_MAX the result is kLog2Table[v >> log_cnt]
 // + log_cnt, refined by a correction term once v >= APPROX_LOG_MAX; larger
 // inputs are computed with log().
 static float FastLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;
    // The asm computes:
    //   log_cnt = 24 - clz(v);  y = 1 << log_cnt;  temp = v >> log_cnt;
    __asm__ volatile(
      "clz      %[log_cnt], %[v]                      \n\t"
      "addiu    %[y],       $zero,        1           \n\t"
      "subu     %[log_cnt], %[c24],       %[log_cnt]  \n\t"
      "sllv     %[y],       %[y],         %[log_cnt]  \n\t"
      "srlv     %[temp],    %[v],         %[log_cnt]  \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );
    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor only
      // for large values of 'v'.
      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    // Exact path: ln(v) / ln(2).
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
 }
 // C version of this function:
 //   int i = 0;
 //   int64_t cost = 0;
 //   const uint32_t* pop = &population[4];
 //   const uint32_t* LoopEnd = &population[length];
 //   while (pop != LoopEnd) {
 //     ++i;
 //     cost += i * *pop;
 //     cost += i * *(pop + 1);
 //     pop += 2;
 //   }
 //   return (double)cost;
 //
 // The 64-bit sum is accumulated in the HI/LO register pair with 'madd' and
 // read back as two 32-bit halves (temp0 = HI, temp1 = LO).
 // The loop advances two entries at a time and stops only when 'pop' equals
 // 'LoopEnd' exactly, so (length - 4) must be a non-negative even number.
 static double ExtraCost(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];
  __asm__ volatile(
    "mult   $zero,    $zero                  \n\t"
    "xor    %[i],     %[i],       %[i]       \n\t"
    "beq    %[pop],   %[LoopEnd], 2f         \n\t"
  "1:                                        \n\t"
    "lw     %[temp0], 0(%[pop])              \n\t"
    "lw     %[temp1], 4(%[pop])              \n\t"
    "addiu  %[i],     %[i],       1          \n\t"
    "addiu  %[pop],   %[pop],     8          \n\t"
    "madd   %[i],     %[temp0]               \n\t"
    "madd   %[i],     %[temp1]               \n\t"
    "bne    %[pop],   %[LoopEnd], 1b         \n\t"
  "2:                                        \n\t"
    "mfhi   %[temp0]                         \n\t"
    "mflo   %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );
  // Reassemble HI:LO. LO must be zero-extended before the merge: OR-ing it in
  // as a signed int would sign-extend a set bit 31 over the whole upper half.
  // The shift is done on an unsigned value so that a negative HI does not
  // invoke undefined behavior; the final cast restores the signed result.
  return (double)(int64_t)(((uint64_t)(uint32_t)temp0 << 32) |
                           (uint32_t)temp1);
 }
 // C version of this function:
 //   int i = 0;
 //   int64_t cost = 0;
 //   const uint32_t* pX = &X[4];
 //   const uint32_t* pY = &Y[4];
 //   const uint32_t* LoopEnd = &X[length];
 //   while (pX != LoopEnd) {
 //     const uint32_t xy0 = *pX + *pY;
 //     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
 //     ++i;
 //     cost += i * xy0;
 //     cost += i * xy1;
 //     pX += 2;
 //     pY += 2;
 //   }
 //   return (double)cost;
 //
 // The 64-bit sum is accumulated in the HI/LO register pair with 'madd' and
 // read back as two 32-bit halves (temp0 = HI, temp1 = LO).
 // The loop advances two entries at a time and stops only when 'pX' equals
 // 'LoopEnd' exactly, so (length - 4) must be a non-negative even number.
 static double ExtraCostCombined(const uint32_t* const X,
                                const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];
  __asm__ volatile(
    "mult   $zero,    $zero                  \n\t"
    "xor    %[i],     %[i],       %[i]       \n\t"
    "beq    %[pX],    %[LoopEnd], 2f         \n\t"
  "1:                                        \n\t"
    "lw     %[temp0], 0(%[pX])               \n\t"
    "lw     %[temp1], 0(%[pY])               \n\t"
    "lw     %[temp2], 4(%[pX])               \n\t"
    "lw     %[temp3], 4(%[pY])               \n\t"
    "addiu  %[i],     %[i],       1          \n\t"
    "addu   %[temp0], %[temp0],   %[temp1]   \n\t"
    "addu   %[temp2], %[temp2],   %[temp3]   \n\t"
    "addiu  %[pX],    %[pX],      8          \n\t"
    "addiu  %[pY],    %[pY],      8          \n\t"
    "madd   %[i],     %[temp0]               \n\t"
    "madd   %[i],     %[temp2]               \n\t"
    "bne    %[pX],    %[LoopEnd], 1b         \n\t"
  "2:                                        \n\t"
    "mfhi   %[temp0]                         \n\t"
    "mflo   %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );
  // Reassemble HI:LO. LO must be zero-extended before the merge: OR-ing it in
  // as a signed int would sign-extend a set bit 31 over the whole upper half.
  // The shift is done on an unsigned value so that a negative HI does not
  // invoke undefined behavior; the final cast restores the signed result.
  return (double)(int64_t)(((uint64_t)(uint32_t)temp0 << 32) |
                           (uint32_t)temp1);
 }
 // Records one finished run into the VP8LStreaks counters. The expansion site
 // must provide these locals:
 //   temp0           in: 0 or 1, selects the zero / non-zero row (clobbered)
 //   temp1..temp3    scratch
 //   streak          length of the run
 //   pstreaks, pcnts int pointers to streaks[0][0] and counts[0]
 // Equivalent C:
 //   if (streak > 3) { pcnts[temp0] += 1; pstreaks[2 * temp0 + 1] += streak; }
 //   else            { pstreaks[2 * temp0 + 0] += streak; }
 // NOTE(review): the split here is streak <= 3 / streak > 3, while the field
 // comment on VP8LStreaks reads "streak<3 / streak>=3" -- confirm which bound
 // is intended.
 #define HUFFMAN_COST_PASS                                 \
  __asm__ volatile(                                       \
    "sll   %[temp1],  %[temp0],    3           \n\t"      \
    "addiu %[temp3],  %[streak],   -3          \n\t"      \
    "addu  %[temp2],  %[pstreaks], %[temp1]    \n\t"      \
    "blez  %[temp3],  1f                       \n\t"      \
    "srl   %[temp1],  %[temp1],    1           \n\t"      \
    "addu  %[temp3],  %[pcnts],    %[temp1]    \n\t"      \
    "lw    %[temp0],  4(%[temp2])              \n\t"      \
    "lw    %[temp1],  0(%[temp3])              \n\t"      \
    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"      \
    "addiu %[temp1],  %[temp1],    1           \n\t"      \
    "sw    %[temp0],  4(%[temp2])              \n\t"      \
    "sw    %[temp1],  0(%[temp3])              \n\t"      \
    "b     2f                                  \n\t"      \
  "1:                                          \n\t"      \
    "lw    %[temp0],  0(%[temp2])              \n\t"      \
    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"      \
    "sw    %[temp0],  0(%[temp2])              \n\t"      \
  "2:                                          \n\t"      \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),           \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)             \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),         \
      [streak]"r"(streak)                                 \
    : "memory"                                            \
  );
 // Returns the various RLE counts.
 // Walks population[0..length-1], splitting it into maximal runs of equal
 // consecutive values. Each finished run is recorded through
 // HUFFMAN_COST_PASS, with temp0 telling whether the run's value is non-zero.
 // The final run is flushed after the loop, which reads population[i], so at
 // least one entry must be present.
 static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
  int i;
  int streak = 0;
  VP8LStreaks stats;
  // Raw int views of the counters, addressed by the asm in HUFFMAN_COST_PASS.
  int* const pstreaks = &stats.streaks[0][0];
  int* const pcnts = &stats.counts[0];
  int temp0, temp1, temp2, temp3;
  memset(&stats, 0, sizeof(stats));
  for (i = 0; i < length - 1; ++i) {
    ++streak;
    if (population[i] == population[i + 1]) {
      continue;
    }
    // Run ends at index i: record it and start a new one.
    temp0 = (population[i] != 0);
    HUFFMAN_COST_PASS
    streak = 0;
  }
  // Flush the last run, which ends at the final entry.
  ++streak;
  temp0 = (population[i] != 0);
  HUFFMAN_COST_PASS
  return stats;
 }
 // Same run statistics as HuffmanCostCount(), computed on the element-wise
 // sums X[i] + Y[i] without materializing the summed array. The final run is
 // flushed after the loop, which reads X[i] and Y[i], so at least one entry
 // must be present.
 static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
                                            const uint32_t* Y, int length) {
  int i;
  int streak = 0;
  VP8LStreaks stats;
  // Raw int views of the counters, addressed by the asm in HUFFMAN_COST_PASS.
  int* const pstreaks = &stats.streaks[0][0];
  int* const pcnts = &stats.counts[0];
  int temp0, temp1, temp2, temp3;
  memset(&stats, 0, sizeof(stats));
  for (i = 0; i < length - 1; ++i) {
    const uint32_t xy = X[i] + Y[i];
    const uint32_t xy_next = X[i + 1] + Y[i + 1];
    ++streak;
    if (xy == xy_next) {
      continue;
    }
    // Run ends at index i: record it and start a new one.
    temp0 = (xy != 0);
    HUFFMAN_COST_PASS
    streak = 0;
  }
  {
    // Flush the last run, which ends at the final entry.
    const uint32_t xy = X[i] + Y[i];
    ++streak;
    temp0 = (xy != 0);
    HUFFMAN_COST_PASS
  }
  return stats;
 }
 // Pieces of the inline-asm loop used by ADD_VECTOR / ADD_VECTOR_EQ. They are
 // meant to be expanded back to back as
 //   ASM_START  ADD_TO_OUT(...)  ASM_END_0 or ASM_END_1
 // and together form one __asm__ statement. The expansion site must declare
 // temp0..temp7, LoopEnd, pa, pout (and pb for ASM_END_0).
 //
 // ASM_START opens the statement, saves the assembler state and places the
 // loop label.
 #define ASM_START                                       \
  __asm__ volatile(                                     \
    ".set   push                            \n\t"       \
    ".set   at                              \n\t"       \
    ".set   macro                           \n\t"       \
  "1:                                       \n\t"
 // P2 = P0 + P1
 // A..D - offsets
 // E - temp variable to tell macro
 //     if pointer should be incremented
 // literal_ and successive histograms could be unaligned
 // so we must use ulw and usw
 //
 // One iteration adds four 32-bit values and advances the pointers by 16
 // bytes; it loops until P0 reaches LoopEnd. The last line deliberately has no
 // line continuation, so the macro ends here and cannot absorb whatever line
 // follows it.
 #define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)           \
    "ulw    %[temp0], "#A"(%["#P0"])        \n\t"       \
    "ulw    %[temp1], "#B"(%["#P0"])        \n\t"       \
    "ulw    %[temp2], "#C"(%["#P0"])        \n\t"       \
    "ulw    %[temp3], "#D"(%["#P0"])        \n\t"       \
    "ulw    %[temp4], "#A"(%["#P1"])        \n\t"       \
    "ulw    %[temp5], "#B"(%["#P1"])        \n\t"       \
    "ulw    %[temp6], "#C"(%["#P1"])        \n\t"       \
    "ulw    %[temp7], "#D"(%["#P1"])        \n\t"       \
    "addu   %[temp4], %[temp4],   %[temp0]  \n\t"       \
    "addu   %[temp5], %[temp5],   %[temp1]  \n\t"       \
    "addu   %[temp6], %[temp6],   %[temp2]  \n\t"       \
    "addu   %[temp7], %[temp7],   %[temp3]  \n\t"       \
    "addiu  %["#P0"],  %["#P0"],  16        \n\t"       \
  ".if "#E" == 1                            \n\t"       \
    "addiu  %["#P1"],  %["#P1"],  16        \n\t"       \
  ".endif                                   \n\t"       \
    "usw    %[temp4], "#A"(%["#P2"])        \n\t"       \
    "usw    %[temp5], "#B"(%["#P2"])        \n\t"       \
    "usw    %[temp6], "#C"(%["#P2"])        \n\t"       \
    "usw    %[temp7], "#D"(%["#P2"])        \n\t"       \
    "addiu  %["#P2"], %["#P2"],   16        \n\t"       \
    "bne    %["#P0"], %[LoopEnd], 1b        \n\t"       \
    ".set   pop                             \n\t"
 // Output operands shared by both loop flavours.
 #define ASM_END_COMMON_0                                \
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),         \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),         \
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),         \
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),         \
      [pa]"+r"(pa), [pout]"+r"(pout)
 // Input operand, clobber list and end of the __asm__ statement.
 #define ASM_END_COMMON_1                                \
    : [LoopEnd]"r"(LoopEnd)                             \
    : "memory", "at"                                    \
  );
 // Closes a three-pointer loop (pa, pb, pout).
 #define ASM_END_0                                       \
    ASM_END_COMMON_0                                    \
      , [pb]"+r"(pb)                                    \
    ASM_END_COMMON_1
 // Closes a two-pointer loop (pa, pout).
 #define ASM_END_1                                       \
    ASM_END_COMMON_0                                    \
    ASM_END_COMMON_1
 // OUT[i] = A[i] + B[i] for SIZE entries through the asm loop (SIZE must be a
 // multiple of 4), then for EXTRA_SIZE further entries in plain C. The asm
 // leaves pa / pb / pout advanced past the first SIZE entries, so the scalar
 // loop indexes from there. The expansion site must declare temp0..temp7.
 #define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE)  do {   \
  const uint32_t* pa = (const uint32_t*)(A);            \
  const uint32_t* pb = (const uint32_t*)(B);            \
  uint32_t* pout = (uint32_t*)(OUT);                    \
  const uint32_t* const LoopEnd = pa + (SIZE);          \
  assert((SIZE) % 4 == 0);                              \
  ASM_START                                             \
  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)              \
  ASM_END_0                                             \
  if ((EXTRA_SIZE) > 0) {                               \
    const int last = (EXTRA_SIZE);                      \
    int i;                                              \
    for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \
  }                                                     \
 } while (0)
 // In-place flavour: OUT[i] += A[i] for SIZE entries through the asm loop
 // (SIZE must be a multiple of 4), then for EXTRA_SIZE further entries in
 // plain C. 'pout' is passed as both second source and destination, so it is
 // advanced only once per iteration (E == 0). The expansion site must declare
 // temp0..temp7.
 #define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE)  do {   \
  const uint32_t* pa = (const uint32_t*)(A);            \
  uint32_t* pout = (uint32_t*)(OUT);                    \
  const uint32_t* const LoopEnd = pa + (SIZE);          \
  assert((SIZE) % 4 == 0);                              \
  ASM_START                                             \
  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)            \
  ASM_END_1                                             \
  if ((EXTRA_SIZE) > 0) {                               \
    const int last = (EXTRA_SIZE);                      \
    int i;                                              \
    for (i = 0; i < last; ++i) pout[i] += pa[i];        \
  }                                                     \
 } while (0)
 // Adds histograms 'a' and 'b' into 'out', array by array (literal_,
 // distance_, red_, blue_, alpha_). When 'b' and 'out' are the same object the
 // in-place macro is used so that 'out' is accumulated rather than re-read
 // through a second pointer. Both inputs must have the same
 // palette_code_bits_.
 static void HistogramAdd(const VP8LHistogram* const a,
                         const VP8LHistogram* const b,
                         VP8LHistogram* const out) {
  // Scratch registers referenced by name inside ADD_VECTOR / ADD_VECTOR_EQ.
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  // Entries of literal_ beyond the fixed literal + length codes; these are
  // added by the scalar tail loop of the macros.
  const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                             - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
  assert(a->palette_code_bits_ == b->palette_code_bits_);
  if (b != out) {
    ADD_VECTOR(a->literal_, b->literal_, out->literal_,
               NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR(a->distance_, b->distance_, out->distance_,
               NUM_DISTANCE_CODES, 0);
    ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  } else {
    ADD_VECTOR_EQ(a->literal_, out->literal_,
                  NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
    ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  }
 }
 #undef ADD_VECTOR_EQ
 #undef ADD_VECTOR
 #undef ASM_END_1
 #undef ASM_END_0
 #undef ASM_END_COMMON_1
 #undef ASM_END_COMMON_0
 #undef ADD_TO_OUT
 #undef ASM_START
 #endif  // WEBP_USE_MIPS32
 //------------------------------------------------------------------------------
 // Entry point
 extern void VP8LDspInitMIPS32(void);
 // Points the lossless DSP function pointers at the MIPS32 implementations
 // above. Expands to an empty function when WEBP_USE_MIPS32 is not defined.
 void VP8LDspInitMIPS32(void) {
 #ifdef WEBP_USE_MIPS32
  // Logarithm slow paths.
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LFastSLog2Slow = FastSLog2Slow;
  // Cost estimation.
  VP8LExtraCost = ExtraCost;
  VP8LExtraCostCombined = ExtraCostCombined;
  VP8LHuffmanCostCount = HuffmanCostCount;
  VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
  // Histogram merging.
  VP8LHistogramAdd = HistogramAdd;
 #endif  // WEBP_USE_MIPS32
 }
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@ -0,0 +1,357 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // NEON variant of methods for lossless decoder
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./dsp.h"
 #if defined(WEBP_USE_NEON)
 #include <arm_neon.h>
 #include "./lossless.h"
 #include "./neon.h"
 //------------------------------------------------------------------------------
 // Colorspace conversion functions
 #if !defined(WORK_AROUND_GCC)
 // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
 // gcc-4.8.x at least.
 static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
    const uint8x16_t tmp = pixel.val[0];
    pixel.val[0] = pixel.val[2];
    pixel.val[2] = tmp;
    vst4q_u8(dst, pixel);
    dst += 64;
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
 }
 static void ConvertBGRAToBGR(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
    vst3q_u8(dst, tmp);
    dst += 48;
  }
  VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
 }
 static void ConvertBGRAToRGB(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
    vst3q_u8(dst, tmp);
    dst += 48;
  }
  VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst);  // left-overs
 }
 #else  // WORK_AROUND_GCC
 // gcc-4.6.0 fallback
 // Byte permutation for two BGRA pixels: swaps bytes 0 and 2 of each pixel.
 static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
 // Converts 'num_pixels' BGRA pixels from 'src' into RGBA bytes in 'dst', two
 // pixels per table look-up. An odd last pixel is delegated to
 // VP8LConvertBGRAToRGBA_C().
 static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const uint8x8_t permutation = vld1_u8(kRGBAShuffle);
  const uint32_t* const simd_end = src + (num_pixels & ~1);
  while (src < simd_end) {
    const uint8x8_t bgra = vld1_u8((uint8_t*)src);
    const uint8x8_t rgba = vtbl1_u8(bgra, permutation);
    vst1_u8(dst, rgba);
    src += 2;
    dst += 8;
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst);  // remaining pixel
 }
 // Source byte indices (within 8 BGRA pixels = 32 bytes) for each of the 24
 // output bytes: every fourth source byte is skipped.
 static const uint8_t kBGRShuffle[3][8] = {
  {  0,  1,  2,  4,  5,  6,  8,  9 },
  { 10, 12, 13, 14, 16, 17, 18, 20 },
  { 21, 22, 24, 25, 26, 28, 29, 30 }
 };
 // Converts 'num_pixels' BGRA pixels from 'src' into 3-byte BGR in 'dst',
 // eight pixels per iteration through three table look-ups. The remaining
 // (num_pixels & 7) pixels are delegated to VP8LConvertBGRAToBGR_C().
 static void ConvertBGRAToBGR(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint8x8_t lut0 = vld1_u8(kBGRShuffle[0]);
  const uint8x8_t lut1 = vld1_u8(kBGRShuffle[1]);
  const uint8x8_t lut2 = vld1_u8(kBGRShuffle[2]);
  const uint32_t* const simd_end = src + (num_pixels & ~7);
  while (src < simd_end) {
    // 32 source bytes, loaded as four 8-byte vectors.
    uint8x8x4_t in;
    INIT_VECTOR4(in,
                 vld1_u8((const uint8_t*)(src + 0)),
                 vld1_u8((const uint8_t*)(src + 2)),
                 vld1_u8((const uint8_t*)(src + 4)),
                 vld1_u8((const uint8_t*)(src + 6)));
    vst1_u8(dst +  0, vtbl4_u8(in, lut0));
    vst1_u8(dst +  8, vtbl4_u8(in, lut1));
    vst1_u8(dst + 16, vtbl4_u8(in, lut2));
    src += 8;
    dst += 8 * 3;
  }
  VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst);  // remaining pixels
 }
 // Source byte indices (within 8 BGRA pixels = 32 bytes) for each of the 24
 // output bytes: bytes 2, 1, 0 of every pixel, fourth byte skipped.
 static const uint8_t kRGBShuffle[3][8] = {
  {  2,  1,  0,  6,  5,  4, 10,  9 },
  {  8, 14, 13, 12, 18, 17, 16, 22 },
  { 21, 20, 26, 25, 24, 30, 29, 28 }
 };
 // Converts 'num_pixels' BGRA pixels from 'src' into 3-byte RGB in 'dst',
 // eight pixels per iteration through three table look-ups. The remaining
 // (num_pixels & 7) pixels are delegated to VP8LConvertBGRAToRGB_C().
 static void ConvertBGRAToRGB(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint8x8_t lut0 = vld1_u8(kRGBShuffle[0]);
  const uint8x8_t lut1 = vld1_u8(kRGBShuffle[1]);
  const uint8x8_t lut2 = vld1_u8(kRGBShuffle[2]);
  const uint32_t* const simd_end = src + (num_pixels & ~7);
  while (src < simd_end) {
    // 32 source bytes, loaded as four 8-byte vectors.
    uint8x8x4_t in;
    INIT_VECTOR4(in,
                 vld1_u8((const uint8_t*)(src + 0)),
                 vld1_u8((const uint8_t*)(src + 2)),
                 vld1_u8((const uint8_t*)(src + 4)),
                 vld1_u8((const uint8_t*)(src + 6)));
    vst1_u8(dst +  0, vtbl4_u8(in, lut0));
    vst1_u8(dst +  8, vtbl4_u8(in, lut1));
    vst1_u8(dst + 16, vtbl4_u8(in, lut2));
    src += 8;
    dst += 8 * 3;
  }
  VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst);  // remaining pixels
 }
 #endif   // !WORK_AROUND_GCC
 //------------------------------------------------------------------------------
 #ifdef USE_INTRINSICS
 // Per-byte halving add of the two pixels *a and *b; returns the low 32-bit
 // lane of the result.
 static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
                                     const uint32_t* const b) {
  const uint8x8_t va = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t vb = vreinterpret_u8_u64(vcreate_u64(*b));
  return vget_lane_u32(vreinterpret_u32_u8(vhadd_u8(va, vb)), 0);
 }
 // Two-step per-byte halving add: first *a with *c, then that result with
 // *b. Returns the low 32-bit lane.
 static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c) {
  const uint8x8_t va = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t vb = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t vc = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t half_ac = vhadd_u8(va, vc);
  return vget_lane_u32(vreinterpret_u32_u8(vhadd_u8(half_ac, vb)), 0);
 }
 // Per-byte halving add of (*a, *b) and of (*c, *d), followed by a halving
 // add of the two intermediate results. Returns the low 32-bit lane.
 static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c,
                                     const uint32_t* const d) {
  const uint8x8_t va = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t vb = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t vc = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t vd = vreinterpret_u8_u64(vcreate_u64(*d));
  const uint8x8_t half_ab = vhadd_u8(va, vb);
  const uint8x8_t half_cd = vhadd_u8(vc, vd);
  return vget_lane_u32(vreinterpret_u32_u8(vhadd_u8(half_ab, half_cd)), 0);
 }
 // Predictor #5: Average3 of 'left', top[0] and top[1].
 static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
  const uint32_t* const above = &top[0];
  const uint32_t* const above_next = &top[1];
  return Average3(&left, above, above_next);
 }
 // Predictor #6: Average2 of 'left' and top[-1].
 static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
  const uint32_t* const above_prev = &top[-1];
  return Average2(&left, above_prev);
 }
 // Predictor #7: Average2 of 'left' and top[0].
 static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
  const uint32_t* const above = &top[0];
  return Average2(&left, above);
 }
 // Predictor #8: Average2 of top[-1] and top[0]; 'left' is not used.
 static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
  const uint32_t* const above_prev = &top[-1];
  const uint32_t* const above = &top[0];
  (void)left;
  return Average2(above_prev, above);
 }
 // Predictor #9: Average2 of top[0] and top[1]; 'left' is not used.
 static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
  const uint32_t* const above = &top[0];
  const uint32_t* const above_next = &top[1];
  (void)left;
  return Average2(above, above_next);
 }
 // Predictor #10: Average4 of 'left', top[-1], top[0] and top[1].
 static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
  const uint32_t* const above_prev = &top[-1];
  const uint32_t* const above = &top[0];
  const uint32_t* const above_next = &top[1];
  return Average4(&left, above_prev, above, above_next);
 }
 //------------------------------------------------------------------------------
 // Returns *c0 or *c1, whichever choice the channel-wise distances to *c2
 // favour: the sum over the four bytes of (|c1 - c2| - |c0 - c2|) is
 // computed, and *c0 is returned when that sum is <= 0, *c1 otherwise.
 static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
                                    const uint32_t* const c1,
                                    const uint32_t* const c2) {
  const uint8x8_t v0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t v1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t v2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  // per-byte absolute differences against the reference pixel c2
  const uint8x8_t dist1 = vabd_u8(v1, v2);
  const uint8x8_t dist0 = vabd_u8(v0, v2);
  // add adjacent byte pairs, widening to 16 bits
  const int16x4_t pairs1 = vreinterpret_s16_u16(vpaddl_u8(dist1));
  const int16x4_t pairs0 = vreinterpret_s16_u16(vpaddl_u8(dist0));
  // lane 0 accumulates the difference over the pixel's four bytes
  const int32x2_t total = vpaddl_s16(vsub_s16(pairs1, pairs0));
  if (vget_lane_s32(total, 0) <= 0) {
    return *c0;
  }
  return *c1;
 }
 // Predictor #11: Select() between top[0] and 'left', using top[-1] as the
 // reference pixel.
 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
  const uint32_t* const above = &top[0];
  const uint32_t* const above_prev = &top[-1];
  return Select(above, &left, above_prev);
 }
 // Computes, for every byte of the pixel, clamp(c0 + c1 - c2) to [0, 255].
 // The sum is formed in 16 bits; the subtraction saturates at 0 and the
 // final narrowing saturates at 255.
 static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0,
                                                    const uint32_t* const c1,
                                                    const uint32_t* const c2) {
  const uint8x8_t v0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t v1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t v2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint16x8_t wide_sum = vaddl_u8(v0, v1);         // c0 + c1, 16 bits
  const uint16x8_t wide_v2 = vmovl_u8(v2);              // c2, 16 bits
  const uint16x8_t wide_diff = vqsubq_u16(wide_sum, wide_v2);  // floor at 0
  const uint8x8_t clamped = vqmovn_u16(wide_diff);      // cap at 255
  return vget_lane_u32(vreinterpret_u32_u8(clamped), 0);
 }
 // Predictor #12: ClampedAddSubtractFull(left, top[0], top[-1]).
 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
  const uint32_t* const above = &top[0];
  const uint32_t* const above_prev = &top[-1];
  return ClampedAddSubtractFull(&left, above, above_prev);
 }
 // Per byte: with avg = Average(c0, c1), returns avg moved away from c2 by
 // half of |avg - c2|, saturated to [0, 255]. Only one of the two saturated
 // differences below is non-zero for any given byte.
 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
                                                    const uint32_t* const c1,
                                                    const uint32_t* const c2) {
  const uint8x8_t v0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t v1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t v2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t mean = vhadd_u8(v0, v1);              // Average(c0, c1)
  const uint8x8_t above = vqsub_u8(mean, v2);           // max(mean - c2, 0)
  const uint8x8_t below = vqsub_u8(v2, mean);           // max(c2 - mean, 0)
  const uint8x8_t half_above = vshr_n_u8(above, 1);
  const uint8x8_t half_below = vshr_n_u8(below, 1);
  const uint8x8_t raised = vqadd_u8(mean, half_above);  // saturating add
  const uint8x8_t result = vqsub_u8(raised, half_below);
  return vget_lane_u32(vreinterpret_u32_u8(result), 0);
 }
 // Predictor #13: ClampedAddSubtractHalf(left, top[0], top[-1]).
 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
  const uint32_t* const above = &top[0];
  const uint32_t* const above_prev = &top[-1];
  return ClampedAddSubtractHalf(&left, above, above_prev);
 }
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, so
 // USE_VTBLQ is defined there to select the variant of DoGreenShuffle()
 // built on the non-standard vtbl1q_u8 instead.
 // The version test relies on __apple_build_version__ being defined.
 #if defined(__APPLE__) && defined(__aarch64__) && \
    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
 #define USE_VTBLQ
 #endif
 // Two interchangeable definitions of the green-channel shuffle follow.
 // In both, kGreenShuffle is a byte-index table: for each 4-byte pixel it
 // copies byte 1 into bytes 0 and 2 and zeroes bytes 1 and 3, and
 // DoGreenShuffle() applies that table to a 16-byte (4 pixel) vector.
 #ifdef USE_VTBLQ
 // 255 = byte will be zeroed
 // 16-entry table: indexes the whole 16-byte input at once.
 static const uint8_t kGreenShuffle[16] = {
  1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
 };
 // Looks up the full 'argb' vector with each half of 'shuffle' and joins the
 // two 8-byte results.
 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
                                             const uint8x16_t shuffle) {
  return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
                     vtbl1q_u8(argb, vget_high_u8(shuffle)));
 }
 #else  // !USE_VTBLQ
 // 255 = byte will be zeroed
 // 8-entry table: applied separately to each 8-byte half of the input.
 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
 // Looks up the low and high halves of 'argb' with the same 8-byte 'shuffle'
 // and joins the two results.
 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
                                             const uint8x8_t shuffle) {
  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                     vtbl1_u8(vget_high_u8(argb), shuffle));
 }
 #endif  // USE_VTBLQ
 // In-place transform over 'num_pixels' pixels: byte 1 of every pixel (the
 // green byte selected by kGreenShuffle) is subtracted, modulo 256, from
 // bytes 0 and 2. Pixels are handled four at a time with NEON; the
 // (num_pixels & 3) trailing ones go through the plain-C version.
 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const int simd_count = num_pixels & ~3;   // largest multiple of 4
  const uint32_t* const simd_end = argb_data + simd_count;
 #ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
 #else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
 #endif
  while (argb_data < simd_end) {
    const uint8x16_t pixels = vld1q_u8((uint8_t*)argb_data);
    // green copied into bytes 0 and 2 of each pixel, zero elsewhere
    const uint8x16_t greens = DoGreenShuffle(pixels, shuffle);
    vst1q_u8((uint8_t*)argb_data, vsubq_u8(pixels, greens));
    argb_data += 4;
  }
  // finish the remaining 0..3 pixels with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
 }
 // In-place transform over 'num_pixels' pixels: byte 1 of every pixel (the
 // green byte selected by kGreenShuffle) is added, modulo 256, to bytes 0
 // and 2. Pixels are handled four at a time with NEON; the
 // (num_pixels & 3) trailing ones go through the plain-C version.
 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const int simd_count = num_pixels & ~3;   // largest multiple of 4
  const uint32_t* const simd_end = argb_data + simd_count;
 #ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
 #else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
 #endif
  while (argb_data < simd_end) {
    const uint8x16_t pixels = vld1q_u8((uint8_t*)argb_data);
    // green copied into bytes 0 and 2 of each pixel, zero elsewhere
    const uint8x16_t greens = DoGreenShuffle(pixels, shuffle);
    vst1q_u8((uint8_t*)argb_data, vaddq_u8(pixels, greens));
    argb_data += 4;
  }
  // finish the remaining 0..3 pixels with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
 }
 #undef USE_VTBLQ
 #endif   // USE_INTRINSICS
 #endif   // WEBP_USE_NEON
 //------------------------------------------------------------------------------
 extern void VP8LDspInitNEON(void);
 // Points the lossless DSP function pointers at the NEON implementations of
 // this file. Compiles to an empty function when WEBP_USE_NEON is not
 // defined; the predictors and green transforms are only installed when
 // USE_INTRINSICS is defined as well.
 void VP8LDspInitNEON(void) {
 #if defined(WEBP_USE_NEON)
  // color-space conversions
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
 #ifdef USE_INTRINSICS
  // subtract-green transform, forward and inverse
  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
  // predictors #5 to #13
  VP8LPredictors[5] = Predictor5;
  VP8LPredictors[6] = Predictor6;
  VP8LPredictors[7] = Predictor7;
  VP8LPredictors[8] = Predictor8;
  VP8LPredictors[9] = Predictor9;
  VP8LPredictors[10] = Predictor10;
  VP8LPredictors[11] = Predictor11;
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;
 #endif   // USE_INTRINSICS
 #endif   // WEBP_USE_NEON
 }
 //------------------------------------------------------------------------------
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@ -0,0 +1,535 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 variant of methods for lossless decoder
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./dsp.h"
 #include <assert.h>
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 #include "./lossless.h"
 //------------------------------------------------------------------------------
 // Predictor Transform
 // Computes, for each of the four 8-bit channels, c0 + c1 - c2 clamped to
 // [0, 255]. The arithmetic is done on 16-bit lanes and the unsigned
 // saturating pack performs the clamping.
 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                    uint32_t c2) {
  const __m128i kZero = _mm_setzero_si128();
  // zero-extend every channel to 16 bits
  const __m128i wide0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), kZero);
  const __m128i wide1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), kZero);
  const __m128i wide2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), kZero);
  const __m128i combined =
      _mm_sub_epi16(_mm_add_epi16(wide0, wide1), wide2);
  const __m128i packed = _mm_packus_epi16(combined, combined);
  return (uint32_t)_mm_cvtsi128_si32(packed);
 }
 // Per channel: with avg = (c0 + c1) >> 1, returns avg + (avg - c2) / 2
 // clamped to [0, 255]. The division rounds toward zero: 1 is added to a
 // negative difference before the arithmetic shift.
 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
                                                    uint32_t c2) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i wide0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), kZero);
  const __m128i wide1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), kZero);
  const __m128i wide2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), kZero);
  const __m128i mean = _mm_srli_epi16(_mm_add_epi16(wide1, wide0), 1);
  const __m128i delta = _mm_sub_epi16(mean, wide2);
  // all-ones (-1) in the lanes where c2 > mean, i.e. where delta < 0
  const __m128i is_negative = _mm_cmpgt_epi16(wide2, mean);
  // subtracting -1 adds 1 to the negative lanes only
  const __m128i adjusted = _mm_sub_epi16(delta, is_negative);
  const __m128i half_delta = _mm_srai_epi16(adjusted, 1);
  const __m128i result = _mm_add_epi16(mean, half_delta);
  const __m128i packed = _mm_packus_epi16(result, result);
  return (uint32_t)_mm_cvtsi128_si32(packed);
 }
 // Returns 'a' or 'b' depending on their channel-wise distances to 'c':
 // the sum over the four channels of (|b - c| - |a - c|) is computed and
 // 'a' is returned when that sum is <= 0, 'b' otherwise.
 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i va = _mm_cvtsi32_si128(a);
  const __m128i vb = _mm_cvtsi32_si128(b);
  const __m128i vc = _mm_cvtsi32_si128(c);
  // |x - y| on unsigned bytes: of the two saturating differences one is
  // always zero, so OR-ing them yields the absolute value.
  const __m128i abs_ac =
      _mm_or_si128(_mm_subs_epu8(va, vc), _mm_subs_epu8(vc, va));
  const __m128i abs_bc =
      _mm_or_si128(_mm_subs_epu8(vb, vc), _mm_subs_epu8(vc, vb));
  // widen to 16 bits so that the per-channel difference can be negative
  const __m128i delta = _mm_sub_epi16(_mm_unpacklo_epi8(abs_bc, kZero),
                                      _mm_unpacklo_epi8(abs_ac, kZero));
  int16_t lanes[8];
  int total;
  _mm_storeu_si128((__m128i*)lanes, delta);
  total = lanes[0] + lanes[1] + lanes[2] + lanes[3];
  if (total <= 0) {
    return a;
  }
  return b;
 }
 // Returns the per-channel average (a0 + a1) >> 1 of two pixels, left in
 // 16-bit lanes (not packed back to bytes) so callers can keep combining.
 static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i wide0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), kZero);
  const __m128i wide1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), kZero);
  return _mm_srli_epi16(_mm_add_epi16(wide1, wide0), 1);
 }
 // Returns the per-channel average (a0 + a1) >> 1 as a packed 32-bit pixel.
 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
  const __m128i wide_avg = Average2_128i(a0, a1);
  const __m128i packed = _mm_packus_epi16(wide_avg, wide_avg);
  return (uint32_t)_mm_cvtsi128_si32(packed);
 }
 // Returns Average2(Average2(a0, a2), a1) per channel: the outer pair is
 // averaged first, then the result is averaged with a1.
 static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i outer = Average2_128i(a0, a2);
  const __m128i middle = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), kZero);
  const __m128i wide_avg = _mm_srli_epi16(_mm_add_epi16(outer, middle), 1);
  const __m128i packed = _mm_packus_epi16(wide_avg, wide_avg);
  return (uint32_t)_mm_cvtsi128_si32(packed);
 }
 // Returns Average2(Average2(a0, a1), Average2(a2, a3)) per channel.
 static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
                                      uint32_t a2, uint32_t a3) {
  const __m128i first_pair = Average2_128i(a0, a1);
  const __m128i second_pair = Average2_128i(a2, a3);
  const __m128i wide_avg =
      _mm_srli_epi16(_mm_add_epi16(second_pair, first_pair), 1);
  const __m128i packed = _mm_packus_epi16(wide_avg, wide_avg);
  return (uint32_t)_mm_cvtsi128_si32(packed);
 }
 // Predictor #5: Average3 of 'left', top[0] and top[1].
 static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
  return Average3(left, top[0], top[1]);
 }
 // Predictor #6: Average2 of 'left' and top[-1].
 static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
  return Average2(left, top[-1]);
 }
 // Predictor #7: Average2 of 'left' and top[0].
 static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
  return Average2(left, top[0]);
 }
 // Predictor #8: Average2 of top[-1] and top[0]; 'left' is not used.
 static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
  (void)left;
  return Average2(top[-1], top[0]);
 }
 // Predictor #9: Average2 of top[0] and top[1]; 'left' is not used.
 static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
  (void)left;
  return Average2(top[0], top[1]);
 }
 // Predictor #10: Average4 of 'left', top[-1], top[0] and top[1].
 static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
  return Average4(left, top[-1], top[0], top[1]);
 }
 // Predictor #11: Select() between top[0] and 'left', with top[-1] as the
 // reference pixel.
 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
  return Select(top[0], left, top[-1]);
 }
 // Predictor #12: ClampedAddSubtractFull(left, top[0], top[-1]).
 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractFull(left, top[0], top[-1]);
 }
 // Predictor #13: ClampedAddSubtractHalf(left, top[0], top[-1]).
 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractHalf(left, top[0], top[-1]);
 }
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 // In-place transform: for every pixel, bits 8..15 (green) are subtracted,
 // modulo 256, from bits 0..7 and bits 16..23. Four pixels are processed per
 // SSE2 iteration; the remaining ones are passed to the plain-C version.
 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
  int pos = 0;
  while (pos + 4 <= num_pixels) {
    __m128i* const chunk = (__m128i*)&argb_data[pos];
    const __m128i pixels = _mm_loadu_si128(chunk);
    const __m128i green = _mm_and_si128(pixels, green_mask);   // 00g0|00g0|...
    // green replicated one byte above and one byte below:     // 0g0g|0g0g|...
    const __m128i green_x2 = _mm_or_si128(_mm_slli_epi32(green, 8),
                                          _mm_srli_epi32(green, 8));
    _mm_storeu_si128(chunk, _mm_sub_epi8(pixels, green_x2));
    pos += 4;
  }
  // finish off the left-over pixels with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data + pos, num_pixels - pos);
 }
 // In-place transform: for every pixel, bits 8..15 (green) are added,
 // modulo 256, to bits 0..7 and bits 16..23. Four pixels are processed per
 // SSE2 iteration; the remaining ones are passed to the plain-C version.
 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
  int pos = 0;
  while (pos + 4 <= num_pixels) {
    __m128i* const chunk = (__m128i*)&argb_data[pos];
    const __m128i pixels = _mm_loadu_si128(chunk);
    const __m128i green = _mm_and_si128(pixels, green_mask);   // 00g0|00g0|...
    // green replicated one byte above and one byte below:     // 0g0g|0g0g|...
    const __m128i green_x2 = _mm_or_si128(_mm_slli_epi32(green, 8),
                                          _mm_srli_epi32(green, 8));
    _mm_storeu_si128(chunk, _mm_add_epi8(pixels, green_x2));
    pos += 4;
  }
  // finish off the left-over pixels with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data + pos, num_pixels - pos);
 }
 //------------------------------------------------------------------------------
 // Color Transform
 // Multiplies the low byte of each 32-bit lane of 'color_pred' by the low
 // byte of the matching lane of 'color' (both treated as signed 8-bit
 // values) and returns the product shifted right by 5.
 // Callers mask the result to its low 8 bits.
 static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
                                                __m128i color) {
  // Signed 8-bit multiplication is emulated by moving both operands to the
  // upper byte of a 16-bit lane and keeping the high 16 bits of the signed
  // 16-bit product.
  const __m128i pred_hi = _mm_slli_epi32(color_pred, 8);
  const __m128i color_hi = _mm_slli_epi32(color, 8);
  // Eight 16-bit products are computed; the four odd ones involve zeroes.
  const __m128i product = _mm_mulhi_epi16(pred_hi, color_hi);
  return _mm_srli_epi32(product, 5);
 }
 static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
                                       uint32_t* argb_data,
                                       int num_pixels) {
  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
    const __m128i lower_8bit_mask  = _mm_set1_epi32(0x000000ff);
    const __m128i ag = _mm_and_si128(in, alpha_green_mask);      // alpha, green
    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
    const __m128i b = in;
    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
    const __m128i r_new =
        _mm_and_si128(_mm_sub_epi32(r, r_delta), lower_8bit_mask);
    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r);
    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
    const __m128i b_new =
        _mm_and_si128(_mm_sub_epi32(b, b_delta), lower_8bit_mask);
    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // Fall-back to C-version for left-overs.
  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
 }
 static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
                                              uint32_t* argb_data,
                                              int num_pixels) {
  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
    const __m128i lower_8bit_mask  = _mm_set1_epi32(0x000000ff);
    const __m128i ag = _mm_and_si128(in, alpha_green_mask);      // alpha, green
    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
    const __m128i b = in;
    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
    const __m128i r_new =
        _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
    const __m128i b_new =
        _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);
    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
 }
 //------------------------------------------------------------------------------
 // Color-space conversion functions
 // Converts 'num_pixels' 32-bit pixels from 'src' into 4-byte R,G,B,A groups
 // written to 'dst'. Eight pixels (two 128-bit loads, two 128-bit stores) are
 // processed per iteration: the bytes are first de-interleaved into
 // per-channel runs with a cascade of unpack operations, then re-interleaved
 // in the new channel order. Fewer than eight remaining pixels are handed to
 // the C version. All loads and stores are unaligned.
 static void ConvertBGRAToRGBA(const uint32_t* src,
                               int num_pixels, uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    // three rounds of byte unpacking gather each channel's 8 samples
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
    // regroup the 64-bit halves so that interleaving yields r,g then b,a
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
    _mm_storeu_si128(out++, rgba0);
    _mm_storeu_si128(out++, rgba4);
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
 }
 // Converts 'num_pixels' 32-bit pixels from 'src' into 16-bit RGBA4444
 // values (two bytes per pixel) written to 'dst'. Each output byte pairs the
 // high nibble of one channel with the high nibble of another: r with g, and
 // b with a. Eight pixels (two 128-bit loads, one 128-bit store) are
 // processed per iteration; fewer than eight remaining pixels are handed to
 // the C version. When WEBP_SWAP_16BIT_CSP is defined, the two bytes of each
 // output pixel are emitted in the opposite order.
 static void ConvertBGRAToRGBA4444(const uint32_t* src,
                                   int num_pixels, uint8_t* dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    // three rounds of byte unpacking gather each channel's 8 samples
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);    // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);    // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);    // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);    // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);   // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);   // r0...r7 | b0...b7
    // The 16-bit shift leaks bits between neighbouring bytes; the 0x0f mask
    // below removes them, leaving each byte's own high nibble in its low
    // nibble.
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);         // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);  // -r0|-r1|...|-b6|-b7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
 #ifdef WEBP_SWAP_16BIT_CSP
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
 #else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
 #endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
 }
 // Converts 'num_pixels' 32-bit pixels from 'src' into 16-bit RGB565 values
 // (two bytes per pixel) written to 'dst'. One output byte holds the top 5
 // bits of red and the top 3 bits of green; the other holds the next 3 bits
 // of green and the top 5 bits of blue. Alpha is dropped. Eight pixels (two
 // 128-bit loads, one 128-bit store) are processed per iteration; fewer than
 // eight remaining pixels are handed to the C version. When
 // WEBP_SWAP_16BIT_CSP is defined, the two bytes of each output pixel are
 // emitted in the opposite order.
 static void ConvertBGRAToRGB565(const uint32_t* src,
                                 int num_pixels, uint8_t* dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    // three rounds of byte unpacking gather each channel's 8 samples
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    // keep the top 5 bits of red and blue
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
    // top 3 bits of green, moved to the bottom of the byte
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    // next 3 bits of green, moved to the top of the byte
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);              // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
 #ifdef WEBP_SWAP_16BIT_CSP
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);     // rggb0...rggb7
 #else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);     // bgrb0...bgrb7
 #endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
 }
 // Converts 'num_pixels' 32-bit pixels from 'src' into 3-byte B,G,R groups
 // written to 'dst' (the fourth byte of every pixel is dropped). Eight pixels
 // are converted per SSE2 iteration; the remaining ones are handed to the
 // C version.
 static void ConvertBGRAToBGR(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  // Each iteration produces 24 bytes, but the last storel_epi64 below writes
  // 8 bytes starting at offset 18 and therefore touches 26 bytes. That much
  // room exists only while at least 9 pixels (27 bytes) remain. Testing the
  // pixel count, rather than comparing 'dst + 26' with an end pointer, avoids
  // forming a pointer beyond the end of a short output buffer.
  while (num_pixels >= 9) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);   // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);   // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);   // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);   // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);         // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);         // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);          // bgrbgr00|bgrbgr00
    const __m128i c4 = _mm_or_si128(a4l, b4h);          // bgrbgr00|bgrbgr00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    // every store writes 8 bytes, of which only the first 6 are final; the
    // two trailing bytes are overwritten by the next store (or next pass)
    _mm_storel_epi64((__m128i*)(dst +   0), c0);
    _mm_storel_epi64((__m128i*)(dst +   6), c2);
    _mm_storel_epi64((__m128i*)(dst +  12), c4);
    _mm_storel_epi64((__m128i*)(dst +  18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
 }
 //------------------------------------------------------------------------------
 #define LINE_SIZE 16    // 8 or 16
 // Stores a[i] + b[i] into out[i] for i in [0, size). 'size' must be a
 // multiple of LINE_SIZE. Each group of LINE_SIZE values is fully loaded and
 // summed before any of it is written back.
 static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
                       int size) {
  int i;
  assert(size % LINE_SIZE == 0);
  for (i = 0; i < size; i += LINE_SIZE) {
    __m128i sum[LINE_SIZE / 4];
    int k;
    for (k = 0; k < LINE_SIZE / 4; ++k) {
      const __m128i va = _mm_loadu_si128((const __m128i*)&a[i + 4 * k]);
      const __m128i vb = _mm_loadu_si128((const __m128i*)&b[i + 4 * k]);
      sum[k] = _mm_add_epi32(va, vb);
    }
    for (k = 0; k < LINE_SIZE / 4; ++k) {
      _mm_storeu_si128((__m128i*)&out[i + 4 * k], sum[k]);
    }
  }
 }
 // Adds a[i] to out[i] in place for i in [0, size). 'size' must be a
 // multiple of LINE_SIZE. Each group of LINE_SIZE values is fully loaded and
 // summed before any of it is written back.
 static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
  int i;
  assert(size % LINE_SIZE == 0);
  for (i = 0; i < size; i += LINE_SIZE) {
    __m128i sum[LINE_SIZE / 4];
    int k;
    for (k = 0; k < LINE_SIZE / 4; ++k) {
      const __m128i va = _mm_loadu_si128((const __m128i*)&a[i + 4 * k]);
      const __m128i vo = _mm_loadu_si128((const __m128i*)&out[i + 4 * k]);
      sum[k] = _mm_add_epi32(va, vo);
    }
    for (k = 0; k < LINE_SIZE / 4; ++k) {
      _mm_storeu_si128((__m128i*)&out[i + 4 * k], sum[k]);
    }
  }
 }
 #undef LINE_SIZE
 // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
 // that's ok since the histogram values are less than 1<<28 (max picture size).
 // Sets 'out' to the element-wise sum of histograms 'a' and 'b'. 'out' may
 // be the same object as 'b', in which case 'a' is accumulated into it.
 // The first NUM_LITERAL_CODES entries of literal_, and all of red_, blue_
 // and alpha_, are summed with SSE2; the rest of literal_ (up to the code
 // count given by palette_code_bits_) and distance_ are summed one by one.
 static void HistogramAdd(const VP8LHistogram* const a,
                          const VP8LHistogram* const b,
                          VP8LHistogram* const out) {
  const int num_literal = VP8LHistogramNumCodes(a->palette_code_bits_);
  int k;
  assert(a->palette_code_bits_ == b->palette_code_bits_);
  if (b == out) {
    // in-place accumulation: out += a
    AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
    AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
    AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
    AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
  } else {
    AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
    AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
    AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
    AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
  }
  // remaining literal_ entries, past the vectorized part
  k = NUM_LITERAL_CODES;
  while (k < num_literal) {
    out->literal_[k] = a->literal_[k] + b->literal_[k];
    ++k;
  }
  for (k = 0; k < NUM_DISTANCE_CODES; ++k) {
    out->distance_[k] = a->distance_[k] + b->distance_[k];
  }
 }
 #endif   // WEBP_USE_SSE2
 //------------------------------------------------------------------------------
 extern void VP8LDspInitSSE2(void);
 // Points the lossless DSP function pointers at the SSE2 implementations of
 // this file. Compiles to an empty function when WEBP_USE_SSE2 is not
 // defined.
 void VP8LDspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
  // color-space conversions
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
  // color transform, forward and inverse
  VP8LTransformColor = TransformColor;
  VP8LTransformColorInverse = TransformColorInverse;
  // subtract-green transform, forward and inverse
  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
  // histogram accumulation
  VP8LHistogramAdd = HistogramAdd;
  // predictors #5 to #13
  VP8LPredictors[5] = Predictor5;
  VP8LPredictors[6] = Predictor6;
  VP8LPredictors[7] = Predictor7;
  VP8LPredictors[8] = Predictor8;
  VP8LPredictors[9] = Predictor9;
  VP8LPredictors[10] = Predictor10;
  VP8LPredictors[11] = Predictor11;
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;
 #endif   // WEBP_USE_SSE2
 }
 //------------------------------------------------------------------------------
--- a/src/dsp/neon.h
+++ b/src/dsp/neon.h
@ -0,0 +1,82 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  NEON common code.
 #ifndef WEBP_DSP_NEON_H_
 #define WEBP_DSP_NEON_H_
 #include <arm_neon.h>
 #include "./dsp.h"
 // Right now, some intrinsic functions seem slower, so we disable them
 // everywhere except aarch64, where the inline assembly is incompatible.
 #if defined(__aarch64__)
 #define USE_INTRINSICS   // use intrinsics when possible
 #endif
 // Stores 'a' and 'b' into slots 0 and 1 of the '.val[]' array of 'v'.
 // Every parameter is parenthesized so that expression arguments (e.g. '*ptr'
 // for 'v') expand with the intended precedence.
 #define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
 } while (0)
 // Stores 'a', 'b' and 'c' into slots 0..2 of the '.val[]' array of 'v'.
 // Every parameter is parenthesized so that expression arguments (e.g. '*ptr'
 // for 'v') expand with the intended precedence.
 #define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
 } while (0)
 // Stores 'a', 'b', 'c' and 'd' into slots 0..3 of the '.val[]' array of 'v'.
 // Every parameter is parenthesized so that expression arguments (e.g. '*ptr'
 // for 'v') expand with the intended precedence.
 #define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
 } while (0)
 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
 #if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
 #define WORK_AROUND_GCC
 #endif
 // Returns the transpose of a 4x4 matrix of 32-bit lanes: rows.val[i] holds
 // row #i of the input, and val[j] of the result holds column #j of the input.
 static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
  // View every row as two 64-bit halves (each half = two adjacent 32-bit lanes).
  uint64x2x2_t row01, row23;
  row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
  row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
  row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
  row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
  // Transpose 64-bit values (there's no vswp equivalent)
  {
    // The high halves of rows 0/1 and low halves of rows 2/3 are captured
    // first, because row01/row23 are overwritten in place just below.
    const uint64x1_t row0h = vget_high_u64(row01.val[0]);
    const uint64x1_t row2l = vget_low_u64(row23.val[0]);
    const uint64x1_t row1h = vget_high_u64(row01.val[1]);
    const uint64x1_t row3l = vget_low_u64(row23.val[1]);
    // row01 now gathers the low halves (row0|row2 and row1|row3),
    // row23 the high halves (row0|row2 and row1|row3).
    row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
    row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
    row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
    row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
  }
  {
    // Finish with a 32-bit transpose of each pair: out01 yields columns 0 and 1,
    // out23 yields columns 2 and 3.
    const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
                                        vreinterpretq_s32_u64(row01.val[1]));
    const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
                                        vreinterpretq_s32_u64(row23.val[1]));
    int32x4x4_t out;
    out.val[0] = out01.val[0];
    out.val[1] = out01.val[1];
    out.val[2] = out23.val[0];
    out.val[3] = out23.val[1];
    return out;
  }
 }
 #endif  // WEBP_DSP_NEON_H_
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // YUV to RGB upsampling functions.
@ -12,9 +14,7 @@
 #include "./dsp.h"
 #include "./yuv.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#include <assert.h>
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // Fancy upsampler
@ -43,11 +43,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
  const int last_pixel_pair = (len - 1) >> 1;                                  \
  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
-  if (top_y) {                                                                 \
+  assert(top_y != NULL);                                                       \
  {                                                                            \
    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
  }                                                                            \
-  if (bottom_y) {                                                              \
+  if (bottom_y != NULL) {                                                      \
    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
  }                                                                            \
@ -58,7 +59,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
-    if (top_y) {                                                               \
+    {                                                                          \
      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
@ -66,7 +67,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
           top_dst + (2 * x - 0) * XSTEP);                                     \
    }                                                                          \
-    if (bottom_y) {                                                            \
+    if (bottom_y != NULL) {                                                    \
      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
@ -78,12 +79,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
    l_uv = uv;                                                                 \
  }                                                                            \
  if (!(len & 1)) {                                                            \
-    if (top_y) {                                                               \
+    {                                                                          \
      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
           top_dst + (len - 1) * XSTEP);                                       \
    }                                                                          \
-    if (bottom_y) {                                                            \
+    if (bottom_y != NULL) {                                                    \
      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
           bottom_dst + (len - 1) * XSTEP);                                    \
@ -105,57 +106,6 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
 #endif  // FANCY_UPSAMPLING
 //------------------------------------------------------------------------------
 // simple point-sampling
 // Defines a static function FUNC_NAME() that converts 'len' pixels from each
 // of two luma rows (top_y, bottom_y) by calling FUNC(y, u, v, dst) once per
 // pixel. Output pixels are XSTEP bytes apart in top_dst / bottom_dst.
 // One u/v sample is consumed per pair of horizontally adjacent pixels and is
 // shared by both rows; an odd 'len' leaves one trailing pixel that is handled
 // after the loop. Both rows are always processed: neither top_y nor bottom_y
 // is checked against NULL.
 #define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                      const uint8_t* u, const uint8_t* v,                      \
                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
  int i;                                                                       \
  for (i = 0; i < len - 1; i += 2) {                                           \
    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
    FUNC(top_y[1], u[0], v[0], top_dst + XSTEP);                               \
    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
    FUNC(bottom_y[1], u[0], v[0], bottom_dst + XSTEP);                         \
    top_y += 2;                                                                \
    bottom_y += 2;                                                             \
    u++;                                                                       \
    v++;                                                                       \
    top_dst += 2 * XSTEP;                                                      \
    bottom_dst += 2 * XSTEP;                                                   \
  }                                                                            \
  if (i == len - 1) {    /* last one */                                        \
    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
  }                                                                            \
 }
 // All variants implemented.
 // Each line instantiates one sampler: (generated function name, per-pixel
 // converter, number of output bytes per pixel).
 SAMPLE_FUNC(SampleRgbLinePair,      VP8YuvToRgb,  3)
 SAMPLE_FUNC(SampleBgrLinePair,      VP8YuvToBgr,  3)
 SAMPLE_FUNC(SampleRgbaLinePair,     VP8YuvToRgba, 4)
 SAMPLE_FUNC(SampleBgraLinePair,     VP8YuvToBgra, 4)
 SAMPLE_FUNC(SampleArgbLinePair,     VP8YuvToArgb, 4)
 SAMPLE_FUNC(SampleRgba4444LinePair, VP8YuvToRgba4444, 2)
 SAMPLE_FUNC(SampleRgb565LinePair,   VP8YuvToRgb565, 2)
 #undef SAMPLE_FUNC
 // Table of samplers, MODE_LAST entries long. The trailing comment on each
 // entry names the mode it is listed for. The lowercase-named modes
 // (MODE_rgbA, MODE_bgrA, MODE_Argb, MODE_rgbA_4444) reuse the same sampler as
 // their uppercase counterparts.
 const WebPSampleLinePairFunc WebPSamplers[MODE_LAST] = {
  SampleRgbLinePair,       // MODE_RGB
  SampleRgbaLinePair,      // MODE_RGBA
  SampleBgrLinePair,       // MODE_BGR
  SampleBgraLinePair,      // MODE_BGRA
  SampleArgbLinePair,      // MODE_ARGB
  SampleRgba4444LinePair,  // MODE_RGBA_4444
  SampleRgb565LinePair,    // MODE_RGB_565
  SampleRgbaLinePair,      // MODE_rgbA
  SampleBgraLinePair,      // MODE_bgrA
  SampleArgbLinePair,      // MODE_Argb
  SampleRgba4444LinePair   // MODE_rgbA_4444
 };
 //------------------------------------------------------------------------------
 #if !defined(FANCY_UPSAMPLING)
@ -166,7 +116,8 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
  const int half_len = len >> 1;                                               \
  int x;                                                                       \
-  if (top_dst != NULL) {                                                       \
+  assert(top_dst != NULL);                                                     \
  {                                                                            \
    for (x = 0; x < half_len; ++x) {                                           \
      FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
      FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
@ -233,85 +184,17 @@ const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
 };
 //------------------------------------------------------------------------------
-// Premultiplied modes
+// Main calls
-// non dithered-modes
+extern void WebPInitUpsamplersSSE2(void);
 extern void WebPInitUpsamplersNEON(void);
-// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
+static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
-// for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
+    (VP8CPUInfo)&upsampling_last_cpuinfo_used2;
 // one can use instead: (x * a * 65793 + (1 << 23)) >> 24
 #if 1     // (int)(x * a / 255.)
 #define MULTIPLIER(a)   ((a) * 32897UL)
 #define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
 #else     // (int)(x * a / 255. + .5)
 #define MULTIPLIER(a) ((a) * 65793UL)
 #define PREMULTIPLY(x, m) (((x) * (m) + (1UL << 23)) >> 24)
 #endif
 // Scales, in place, the three color samples of every 4-byte pixel by that
 // pixel's alpha sample, using the fixed-point macros above.
 //   rgba:        first byte of the top-left pixel
 //   alpha_first: non-zero if alpha is byte #0 of each pixel (colors in bytes
 //                #1..#3), zero if alpha is byte #3 (colors in bytes #0..#2)
 //   w, h:        number of pixels per row / number of rows
 //   stride:      distance in bytes between the starts of consecutive rows
 static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
                                int w, int h, int stride) {
  const int alpha_pos = alpha_first ? 0 : 3;
  const int color_pos = alpha_first ? 1 : 0;
  int row;
  for (row = 0; row < h; ++row) {
    int col;
    for (col = 0; col < w; ++col) {
      uint8_t* const pixel = rgba + 4 * col;
      const uint32_t a = pixel[alpha_pos];
      if (a == 0xff) continue;   // pixels with alpha 255 are not modified
      {
        const uint32_t scale = MULTIPLIER(a);
        uint8_t* const color = pixel + color_pos;
        int c;
        for (c = 0; c < 3; ++c) {
          color[c] = PREMULTIPLY(color[c], scale);
        }
      }
    }
    rgba += stride;
  }
 }
 #undef MULTIPLIER
 #undef PREMULTIPLY
 // rgbA4444
 #define MULTIPLIER(a)  ((a) * 0x1111)    // 0x1111 ~= (1 << 16) / 15
 // Returns a byte whose two nibbles are both equal to the upper nibble of 'x'.
 static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
  const uint8_t upper = x & 0xf0;
  return upper | (upper >> 4);
 }
 // Returns a byte whose two nibbles are both equal to the lower nibble of 'x'.
 static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
  const uint8_t lower = x & 0x0f;
  return (uint8_t)(lower | (lower << 4));
 }
 // Returns bits 16..23 of the 32-bit product x * m, i.e. 'x' scaled by the
 // 16-bit fixed-point factor 'm'.
 static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
  const uint32_t product = (uint32_t)x * m;
  return (uint8_t)(product >> 16);
 }
 // Scales, in place, the color nibbles of every 2-byte pixel by that pixel's
 // 4-bit alpha, which is read from the low nibble of the second byte and
 // written back unchanged.
 //   rgba4444: first byte of the top-left pixel
 //   w, h:     number of pixels per row / number of rows
 //   stride:   distance in bytes between the starts of consecutive rows
 static void ApplyAlphaMultiply4444(uint8_t* rgba4444,
                                    int w, int h, int stride) {
  int row;
  for (row = 0; row < h; ++row) {
    int col;
    for (col = 0; col < w; ++col) {
      uint8_t* const px = rgba4444 + 2 * col;
      // Both bytes are read before either one is rewritten.
      const uint8_t byte0 = px[0];
      const uint8_t byte1 = px[1];
      const uint8_t a = (byte1 & 0x0f);
      const uint32_t mult = MULTIPLIER(a);
      // Each nibble is expanded to a full byte before being scaled.
      const uint8_t r = multiply(dither_hi(byte0), mult);
      const uint8_t g = multiply(dither_lo(byte0), mult);
      const uint8_t b = multiply(dither_hi(byte1), mult);
      // Repack the upper nibble of each scaled value.
      px[0] = (r & 0xf0) | ((g >> 4) & 0x0f);
      px[1] = (b & 0xf0) | a;
    }
    rgba4444 += stride;
  }
 }
 #undef MULTIPLIER
 // Function-pointer entry points for the alpha-multiply routines. They are
 // initialized here to the plain-C implementations, ApplyAlphaMultiply and
 // ApplyAlphaMultiply4444.
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int)
    = ApplyAlphaMultiply;
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int)
    = ApplyAlphaMultiply4444;
 //------------------------------------------------------------------------------
 // Main call
 void WebPInitUpsamplers(void) {
  if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
 #ifdef FANCY_UPSAMPLING
  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
@ -320,6 +203,10 @@ void WebPInitUpsamplers(void) {
  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@ -335,33 +222,7 @@ void WebPInitUpsamplers(void) {
 #endif
  }
 #endif  // FANCY_UPSAMPLING
  upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
 }
-void WebPInitPremultiply(void) {
+//------------------------------------------------------------------------------
  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply4444;
 #ifdef FANCY_UPSAMPLING
  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitPremultiplySSE2();
    }
 #endif
 #if defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      WebPInitPremultiplyNEON();
    }
 #endif
  }
 #endif  // FANCY_UPSAMPLING
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dsp/upsampling_neon.c
+++ b/src/dsp/upsampling_neon.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // NEON version of YUV to RGB upsampling functions.
@ -12,19 +14,19 @@
 #include "./dsp.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #if defined(WEBP_USE_NEON)
 #include <assert.h>
 #include <arm_neon.h>
 #include <string.h>
 #include "./neon.h"
 #include "./yuv.h"
 #ifdef FANCY_UPSAMPLING
 //-----------------------------------------------------------------------------
 // U/V upsampling
 // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
 #define UPSAMPLE_16PIXELS(r1, r2, out) {                                \
  uint8x8_t a = vld1_u8(r1);                                            \
@ -60,8 +62,9 @@ extern "C" {
  d = vrhadd_u8(d, diag1);                                              \
                                                                        \
  {                                                                     \
-    const uint8x8x2_t a_b = {{ a, b }};                                 \
+    uint8x8x2_t a_b, c_d;                                               \
-    const uint8x8x2_t c_d = {{ c, d }};                                 \
+    INIT_VECTOR2(a_b, a, b);                                            \
    INIT_VECTOR2(c_d, c, d);                                            \
    vst2_u8(out,      a_b);                                             \
    vst2_u8(out + 32, c_d);                                             \
  }                                                                     \
@ -83,125 +86,94 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
  Upsample16Pixels(r1, r2, out);                                        \
 }
-#define CY  76283
+//-----------------------------------------------------------------------------
-#define CVR 89858
+// YUV->RGB conversion
 #define CUG 22014
 #define CVG 45773
 #define CUB 113618
-static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 };
+static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
 #define v255 vdup_n_u8(255)
 #define STORE_Rgb(out, r, g, b) do {                                    \
  uint8x8x3_t r_g_b;                                                    \
  INIT_VECTOR3(r_g_b, r, g, b);                                         \
  vst3_u8(out, r_g_b);                                                  \
 } while (0)
 #define STORE_Bgr(out, r, g, b) do {                                    \
  uint8x8x3_t b_g_r;                                                    \
  INIT_VECTOR3(b_g_r, b, g, r);                                         \
  vst3_u8(out, b_g_r);                                                  \
 } while (0)
 #define STORE_Rgba(out, r, g, b) do {                                   \
  uint8x8x4_t r_g_b_v255;                                               \
  INIT_VECTOR4(r_g_b_v255, r, g, b, v255);                              \
  vst4_u8(out, r_g_b_v255);                                             \
 } while (0)
 #define STORE_Bgra(out, r, g, b) do {                                   \
  uint8x8x4_t b_g_r_v255;                                               \
  INIT_VECTOR4(b_g_r_v255, b, g, r, v255);                              \
  vst4_u8(out, b_g_r_v255);                                             \
 } while (0)
 #define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
  int i;                                                                \
  for (i = 0; i < N; i += 8) {                                          \
-    int off = ((cur_x) + i) * XSTEP;                                    \
+    const int off = ((cur_x) + i) * XSTEP;                              \
-    uint8x8_t y  = vld1_u8(src_y + (cur_x)  + i);                       \
+    uint8x8_t y  = vld1_u8((src_y) + (cur_x)  + i);                     \
    uint8x8_t u  = vld1_u8((src_uv) + i);                               \
    uint8x8_t v  = vld1_u8((src_uv) + i + 16);                          \
-    int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));             \
+    const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));       \
-    int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));            \
+    const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));      \
-    int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));            \
+    const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));      \
-                                                                        \
+    int32x4_t yl = vmull_lane_s16(vget_low_s16(yy),  cf16, 0);          \
-    int16x8_t ud = vshlq_n_s16(uu, 1);                                  \
+    int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0);          \
-    int16x8_t vd = vshlq_n_s16(vv, 1);                                  \
+    const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv),  cf16, 1);\
-                                                                        \
+    const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
-    int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1),  \
+    int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu),  cf16, 2);      \
-                                     vget_low_s16(vd),  cf16, 0);       \
+    int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2);      \
-    int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), \
+    const int32x4_t bl = vmovl_s16(vget_low_s16(uu));                   \
-                                     vget_high_s16(vd), cf16, 0);       \
+    const int32x4_t bh = vmovl_s16(vget_high_s16(uu));                  \
-    int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16),                  \
+    gl = vmlsl_lane_s16(gl, vget_low_s16(vv),  cf16, 3);                \
-                                vrshrn_n_s32(vrh, 16));                 \
+    gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3);                \
-                                                                        \
+    yl = vmlaq_lane_s32(yl, bl, cf32, 0);                               \
-    int32x4_t vl = vmovl_s16(vget_low_s16(vv));                         \
+    yh = vmlaq_lane_s32(yh, bh, cf32, 0);                               \
-    int32x4_t vh = vmovl_s16(vget_high_s16(vv));                        \
+    /* vrshrn_n_s32() already incorporates the rounding constant */     \
-    int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu),  cf16, 1);     \
+    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2),            \
-    int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);     \
+                                 vrshrn_n_s32(rh, YUV_FIX2)));          \
-    int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv),  cf16, 2);  \
+    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2),            \
-    int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);  \
+                                 vrshrn_n_s32(gh, YUV_FIX2)));          \
-    int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16),                  \
+    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2),            \
-                                vrshrn_n_s32(gch, 16));                 \
+                                 vrshrn_n_s32(yh, YUV_FIX2)));          \
-                                                                        \
+    STORE_ ## FMT(out + off, y, u, v);                                  \
    int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1),  \
                                     vget_low_s16(ud),  cf16, 3);       \
    int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), \
                                     vget_high_s16(ud), cf16, 3);       \
    int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16),                  \
                                vrshrn_n_s32(ubh, 16));                 \
                                                                        \
    int32x4_t rl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(vr));      \
    int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));     \
    int32x4_t gl = vsubl_s16(vget_low_s16(yy),  vget_low_s16(gc));      \
    int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));     \
    int32x4_t bl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(ub));      \
    int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));     \
                                                                        \
    rl = vmulq_lane_s32(rl, cf32, 0);                                   \
    rh = vmulq_lane_s32(rh, cf32, 0);                                   \
    gl = vmulq_lane_s32(gl, cf32, 0);                                   \
    gh = vmulq_lane_s32(gh, cf32, 0);                                   \
    bl = vmulq_lane_s32(bl, cf32, 0);                                   \
    bh = vmulq_lane_s32(bh, cf32, 0);                                   \
                                                                        \
    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16),                  \
                                 vrshrn_n_s32(rh, 16)));                \
    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16),                  \
                                 vrshrn_n_s32(gh, 16)));                \
    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16),                  \
                                 vrshrn_n_s32(bh, 16)));                \
    STR_ ## FMT(out + off, y, u, v);                                    \
  }                                                                     \
 }
-#define v255 vmov_n_u8(255)
+#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) {           \
 #define STR_Rgb(out, r, g, b) do {                                      \
  const uint8x8x3_t r_g_b = {{ r, g, b }};                              \
  vst3_u8(out, r_g_b);                                                  \
 } while (0)
 #define STR_Bgr(out, r, g, b) do {                                      \
  const uint8x8x3_t b_g_r = {{ b, g, r }};                              \
  vst3_u8(out, b_g_r);                                                  \
 } while (0)
 #define STR_Rgba(out, r, g, b) do {                                     \
  const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }};                   \
  vst4_u8(out, r_g_b_v255);                                             \
 } while (0)
 #define STR_Bgra(out, r, g, b) do {                                     \
  const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }};                   \
  vst4_u8(out, b_g_r_v255);                                             \
 } while (0)
 #define CONVERT1(FMT, XSTEP, N, src_y, src_uv, rgb, cur_x) {            \
  int i;                                                                \
  for (i = 0; i < N; i++) {                                             \
-    int off = ((cur_x) + i) * XSTEP;                                    \
+    const int off = ((cur_x) + i) * XSTEP;                              \
-    int y = src_y[(cur_x) + i];                                         \
+    const int y = src_y[(cur_x) + i];                                   \
-    int u = (src_uv)[i];                                                \
+    const int u = (src_uv)[i];                                          \
-    int v = (src_uv)[i + 16];                                           \
+    const int v = (src_uv)[i + 16];                                     \
-    VP8YuvTo ## FMT(y, u, v, rgb + off);                                \
+    FUNC(y, u, v, rgb + off);                                           \
  }                                                                     \
 }
 #define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
                      top_dst, bottom_dst, cur_x, len) {                \
  if (top_y) {                                                          \
  CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                  \
-  }                                                                     \
+  if (bottom_y != NULL) {                                               \
  if (bottom_y) {                                                       \
    CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x)   \
  }                                                                     \
 }
-#define CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, uv,                  \
+#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv,                 \
                      top_dst, bottom_dst, cur_x, len) {                \
-  if (top_y) {                                                          \
+  CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x);                \
-    CONVERT1(FMT, XSTEP, len, top_y, uv, top_dst, cur_x);               \
+  if (bottom_y != NULL) {                                               \
-  }                                                                     \
+    CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
  if (bottom_y) {                                                       \
    CONVERT1(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x);  \
  }                                                                     \
 }
@ -223,18 +195,19 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
                                                                        \
-  const int16x4_t cf16 = vld1_s16(coef);                                \
+  const int16x4_t cf16 = vld1_s16(kCoeffs);                             \
-  const int32x2_t cf32 = vmov_n_s32(CY);                                \
+  const int32x2_t cf32 = vdup_n_s32(kUToB);                             \
-  const uint8x8_t u16  = vmov_n_u8(16);                                 \
+  const uint8x8_t u16  = vdup_n_u8(16);                                 \
-  const uint8x8_t u128 = vmov_n_u8(128);                                \
+  const uint8x8_t u128 = vdup_n_u8(128);                                \
                                                                        \
  /* Treat the first pixel in regular way */                            \
-  if (top_y) {                                                          \
+  assert(top_y != NULL);                                                \
  {                                                                     \
    const int u0 = (top_u[0] + u_diag) >> 1;                            \
    const int v0 = (top_v[0] + v_diag) >> 1;                            \
    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
  }                                                                     \
-  if (bottom_y) {                                                       \
+  if (bottom_y != NULL) {                                               \
    const int u0 = (cur_u[0] + u_diag) >> 1;                            \
    const int v0 = (cur_v[0] + v_diag) >> 1;                            \
    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
@ -253,15 +226,15 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
                                                                        \
  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
-  CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, r_uv,                      \
+  CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,          \
                top_dst, bottom_dst, last_pos, len - last_pos);         \
 }
 // NEON variants of the fancy upsampler.
-NEON_UPSAMPLE_FUNC(UpsampleRgbLinePairNEON,  Rgb,  3)
+NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair,  Rgb,  3)
-NEON_UPSAMPLE_FUNC(UpsampleBgrLinePairNEON,  Bgr,  3)
+NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair,  Bgr,  3)
-NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePairNEON, Rgba, 4)
+NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
-NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
+NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
 #endif  // FANCY_UPSAMPLING
@ -269,24 +242,26 @@ NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
 //------------------------------------------------------------------------------
 extern void WebPInitUpsamplersNEON(void);
 #ifdef FANCY_UPSAMPLING
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 void WebPInitUpsamplersNEON(void) {
 #if defined(WEBP_USE_NEON)
-  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairNEON;
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairNEON;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairNEON;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair;
-  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairNEON;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
 #endif   // WEBP_USE_NEON
 }
-void WebPInitPremultiplyNEON(void) {
+#else
 #if defined(WEBP_USE_NEON)
  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairNEON;
  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairNEON;
 #endif   // WEBP_USE_NEON
 }
-#if defined(__cplusplus) || defined(c_plusplus)
+// this empty function is to avoid an empty .o
-}    // extern "C"
+void WebPInitUpsamplersNEON(void) {}
-#endif
+
 #endif  // FANCY_UPSAMPLING
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of YUV to RGB upsampling functions.
@ -11,10 +13,6 @@
 #include "./dsp.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #if defined(WEBP_USE_SSE2)
 #include <assert.h>
@ -49,7 +47,7 @@ extern "C" {
  (out) = _mm_sub_epi8(tmp0, tmp4);    /* (k + in + 1) / 2 - lsb_correction */ \
 } while (0)
-// pack and store two alterning pixel rows
+// pack and store two alternating pixel rows
 #define PACK_AND_STORE(a, b, da, db, out) do {                                 \
  const __m128i t_a = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */ \
  const __m128i t_b = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */ \
@ -85,8 +83,8 @@ extern "C" {
  GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
                                                                               \
  /* pack the alternate pixels */                                              \
-  PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]);                          \
+  PACK_AND_STORE(a, b, diag1, diag2, out +      0);  /* store top */           \
-  PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]);                          \
+  PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32);  /* store bottom */        \
 }
 // Turn the macro into a function for reducing code-size when non-critical
@ -106,82 +104,82 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
  Upsample32Pixels(r1, r2, out);                                               \
 }
-#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv,                          \
+#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
                    top_dst, bottom_dst, cur_x, num_pixels) {                  \
  int n;                                                                       \
  if (top_y) {                                                                 \
  for (n = 0; n < (num_pixels); ++n) {                                         \
-      FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n],                          \
+    FUNC(top_y[(cur_x) + n], r_u[n], r_v[n],                                   \
         top_dst + ((cur_x) + n) * XSTEP);                                     \
  }                                                                            \
-  }                                                                            \
+  if (bottom_y != NULL) {                                                      \
  if (bottom_y) {                                                              \
    for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n],             \
+      FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n],                    \
           bottom_dst + ((cur_x) + n) * XSTEP);                                \
    }                                                                          \
  }                                                                            \
 }
 #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
                       top_dst, bottom_dst, cur_x) do {                        \
  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
  if (bottom_y != NULL) {                                                      \
    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
             bottom_dst + (cur_x) * XSTEP);                                    \
  }                                                                            \
 } while (0)
 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                      const uint8_t* top_u, const uint8_t* top_v,              \
                      const uint8_t* cur_u, const uint8_t* cur_v,              \
                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
-  int block;                                                                   \
+  int uv_pos, pos;                                                             \
-  /* 16 byte aligned array to cache reconstructed u and v */                   \
+  /* 16-byte-aligned array to cache reconstructed u and v */                   \
  uint8_t uv_buf[4 * 32 + 15];                                                 \
-  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);            \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
-  const int uv_len = (len + 1) >> 1;                                           \
+  uint8_t* const r_v = r_u + 32;                                               \
  /* 17 pixels must be readable for each block */                              \
  const int num_blocks = (uv_len - 1) >> 4;                                    \
  const int leftover = uv_len - num_blocks * 16;                               \
  const int last_pos = 1 + 32 * num_blocks;                                    \
                                                                               \
  assert(top_y != NULL);                                                       \
  {   /* Treat the first pixel in regular way */                               \
    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                       \
    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                       \
-                                                                               \
+    const int u0_t = (top_u[0] + u_diag) >> 1;                                 \
-  assert(len > 0);                                                             \
+    const int v0_t = (top_v[0] + v_diag) >> 1;                                 \
-  /* Treat the first pixel in regular way */                                   \
+    FUNC(top_y[0], u0_t, v0_t, top_dst);                                       \
-  if (top_y) {                                                                 \
+    if (bottom_y != NULL) {                                                    \
-    const int u0 = (top_u[0] + u_diag) >> 1;                                   \
+      const int u0_b = (cur_u[0] + u_diag) >> 1;                               \
-    const int v0 = (top_v[0] + v_diag) >> 1;                                   \
+      const int v0_b = (cur_v[0] + v_diag) >> 1;                               \
-    FUNC(top_y[0], u0, v0, top_dst);                                           \
+      FUNC(bottom_y[0], u0_b, v0_b, bottom_dst);                               \
    }                                                                          \
  if (bottom_y) {                                                              \
    const int u0 = (cur_u[0] + u_diag) >> 1;                                   \
    const int v0 = (cur_v[0] + v_diag) >> 1;                                   \
    FUNC(bottom_y[0], u0, v0, bottom_dst);                                     \
  }                                                                            \
-                                                                               \
+  /* For UPSAMPLE_32PIXELS, 17 u/v values must be readable for each block */   \
-  for (block = 0; block < num_blocks; ++block) {                               \
+  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {    \
-    UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32);                            \
+    UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
-    UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32);                            \
+    UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
-    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,       \
+    CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos);    \
-                32 * block + 1, 32)                                            \
+  }                                                                            \
-    top_u += 16;                                                               \
+  if (len > 1) {                                                               \
-    cur_u += 16;                                                               \
+    const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
-    top_v += 16;                                                               \
+    assert(left_over > 0);                                                     \
-    cur_v += 16;                                                               \
+    UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
    UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst,             \
                pos, len - pos);                                               \
  }                                                                            \
                                                                               \
  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32);                  \
  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32);                  \
  CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,         \
              last_pos, len - last_pos);                                       \
 }
 // SSE2 variants of the fancy upsampler.
-SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2,  VP8YuvToRgb,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
-SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2,  VP8YuvToBgr,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
-SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
 #undef GET_M
 #undef PACK_AND_STORE
 #undef UPSAMPLE_32PIXELS
 #undef UPSAMPLE_LAST_BLOCK
 #undef CONVERT2RGB
 #undef CONVERT2RGB_32
 #undef SSE2_UPSAMPLE_FUNC
 #endif  // FANCY_UPSAMPLING
@ -190,26 +188,27 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
 //------------------------------------------------------------------------------
 extern void WebPInitUpsamplersSSE2(void);
 #ifdef FANCY_UPSAMPLING
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 void WebPInitUpsamplersSSE2(void) {
 #if defined(WEBP_USE_SSE2)
-  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairSSE2;
+  VP8YUVInitSSE2();
-  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairSSE2;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair;
  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
 #endif   // WEBP_USE_SSE2
 }
-void WebPInitPremultiplySSE2(void) {
+#else
 #if defined(WEBP_USE_SSE2)
  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;
  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;
 #endif   // WEBP_USE_SSE2
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 // this empty function is to avoid an empty .o
 void WebPInitUpsamplersSSE2(void) {}
 #endif  // FANCY_UPSAMPLING
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@ -1,26 +1,19 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// YUV->RGB conversion function
+// YUV->RGB conversion functions
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./yuv.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#if defined(WEBP_YUV_USE_TABLE)
 extern "C" {
 #endif
 #ifdef WEBP_YUV_USE_TABLE
 int16_t VP8kVToR[256], VP8kUToB[256];
 int32_t VP8kVToG[256], VP8kUToG[256];
 uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
 static int done = 0;
@ -28,6 +21,11 @@ static WEBP_INLINE uint8_t clip(int v, int max_value) {
  return v < 0 ? 0 : v > max_value ? max_value : v;
 }
 int16_t VP8kVToR[256], VP8kUToB[256];
 int32_t VP8kVToG[256], VP8kUToG[256];
 uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
 void VP8YUVInit(void) {
  int i;
  if (done) {
@ -68,6 +66,95 @@ void VP8YUVInit(void) {}
 #endif  // WEBP_YUV_USE_TABLE
-#if defined(__cplusplus) || defined(c_plusplus)
+//-----------------------------------------------------------------------------
-}    // extern "C"
+// Plain-C version
-#endif
+
 #define ROW_FUNC(FUNC_NAME, FUNC, XSTEP)                                       \
 static void FUNC_NAME(const uint8_t* y,                                        \
                      const uint8_t* u, const uint8_t* v,                      \
                      uint8_t* dst, int len) {                                 \
  const uint8_t* const end = dst + (len & ~1) * XSTEP;                         \
  while (dst != end) {                                                         \
    FUNC(y[0], u[0], v[0], dst);                                               \
    FUNC(y[1], u[0], v[0], dst + XSTEP);                                       \
    y += 2;                                                                    \
    ++u;                                                                       \
    ++v;                                                                       \
    dst += 2 * XSTEP;                                                          \
  }                                                                            \
  if (len & 1) {                                                               \
    FUNC(y[0], u[0], v[0], dst);                                               \
  }                                                                            \
 }                                                                              \
 // All variants implemented.
 ROW_FUNC(YuvToRgbRow,      VP8YuvToRgb,  3)
 ROW_FUNC(YuvToBgrRow,      VP8YuvToBgr,  3)
 ROW_FUNC(YuvToRgbaRow,     VP8YuvToRgba, 4)
 ROW_FUNC(YuvToBgraRow,     VP8YuvToBgra, 4)
 ROW_FUNC(YuvToArgbRow,     VP8YuvToArgb, 4)
 ROW_FUNC(YuvToRgba4444Row, VP8YuvToRgba4444, 2)
 ROW_FUNC(YuvToRgb565Row,   VP8YuvToRgb565, 2)
 #undef ROW_FUNC
 // Main call for processing a plane with a WebPSamplerRowFunc function:
 void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
                             const uint8_t* u, const uint8_t* v, int uv_stride,
                             uint8_t* dst, int dst_stride,
                             int width, int height, WebPSamplerRowFunc func) {
  int j;
  for (j = 0; j < height; ++j) {
    func(y, u, v, dst, width);
    y += y_stride;
    if (j & 1) {
      u += uv_stride;
      v += uv_stride;
    }
    dst += dst_stride;
  }
 }
 //-----------------------------------------------------------------------------
 // Main call
 WebPSamplerRowFunc WebPSamplers[MODE_LAST];
 extern void WebPInitSamplersSSE2(void);
 extern void WebPInitSamplersMIPS32(void);
 static volatile VP8CPUInfo yuv_last_cpuinfo_used =
    (VP8CPUInfo)&yuv_last_cpuinfo_used;
 void WebPInitSamplers(void) {
  if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
  WebPSamplers[MODE_RGB]       = YuvToRgbRow;
  WebPSamplers[MODE_RGBA]      = YuvToRgbaRow;
  WebPSamplers[MODE_BGR]       = YuvToBgrRow;
  WebPSamplers[MODE_BGRA]      = YuvToBgraRow;
  WebPSamplers[MODE_ARGB]      = YuvToArgbRow;
  WebPSamplers[MODE_RGBA_4444] = YuvToRgba4444Row;
  WebPSamplers[MODE_RGB_565]   = YuvToRgb565Row;
  WebPSamplers[MODE_rgbA]      = YuvToRgbaRow;
  WebPSamplers[MODE_bgrA]      = YuvToBgraRow;
  WebPSamplers[MODE_Argb]      = YuvToArgbRow;
  WebPSamplers[MODE_rgbA_4444] = YuvToRgba4444Row;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitSamplersSSE2();
    }
 #endif  // WEBP_USE_SSE2
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      WebPInitSamplersMIPS32();
    }
 #endif  // WEBP_USE_MIPS32
  }
  yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }
 //-----------------------------------------------------------------------------
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // inline YUV<->RGB conversion function
@ -12,7 +14,7 @@
 // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
-// We use 16bit fixed point operations for RGB->YUV conversion.
+// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
 //
 // For the Y'CbCr to RGB conversion, the BT.601 specification reads:
 //   R = 1.164 * (Y-16) + 1.596 * (V-128)
@ -21,21 +23,24 @@
 // where Y is in the [16,235] range, and U/V in the [16,240] range.
 // In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
 // "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
-// So in this case the formulae should be read as:
+// So in this case the formulae should read:
 //   R = 1.164 * [Y + 1.371 * (V-128)                  ] - 18.624
 //   G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
 //   B = 1.164 * [Y                   + 1.733 * (U-128)] - 18.624
-// once factorized. Here too, 16bit fixed precision is used.
+// once factorized.
 // For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2).
 // That's the maximum possible for a convenient ARM implementation.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_DSP_YUV_H_
 #define WEBP_DSP_YUV_H_
 #include "./dsp.h"
 #include "../dec/decode_vp8.h"
 // Define the following to use the LUT-based code:
-#define WEBP_YUV_USE_TABLE
+// #define WEBP_YUV_USE_TABLE
 #if defined(WEBP_EXPERIMENTAL_FEATURES)
 // Do NOT activate this feature for real compression. This is only experimental!
@ -50,25 +55,111 @@
 //------------------------------------------------------------------------------
 // YUV -> RGB conversion
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
-enum { YUV_FIX = 16,                // fixed-point precision
+enum {
  YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
  YUV_HALF = 1 << (YUV_FIX - 1),
  YUV_MASK = (256 << YUV_FIX) - 1,
  YUV_RANGE_MIN = -227,            // min value of r/g/b output
-       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
+  YUV_RANGE_MAX = 256 + 226,       // max value of r/g/b output
  YUV_FIX2 = 14,                   // fixed-point precision for YUV->RGB
  YUV_HALF2 = 1 << (YUV_FIX2 - 1),
  YUV_MASK2 = (256 << YUV_FIX2) - 1
 };
-#ifdef WEBP_YUV_USE_TABLE
+// These constants are 14-bit fixed-point versions of ITU-R BT.601 constants.
 #define kYScale 19077    // 1.164 = 255 / 219
 #define kVToR   26149    // 1.596 = 255 / 112 * 0.701
 #define kUToG   6419     // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
 #define kVToG   13320    // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
 #define kUToB   33050    // 2.018 = 255 / 112 * 0.886
 #define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
 #define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
 #define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
 //------------------------------------------------------------------------------
 #if !defined(WEBP_YUV_USE_TABLE)
 // slower on x86 by ~7-8%, but bit-exact with the SSE2 version
 static WEBP_INLINE int VP8Clip8(int v) {
  return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255;
 }
 static WEBP_INLINE int VP8YUVToR(int y, int v) {
  return VP8Clip8(kYScale * y + kVToR * v + kRCst);
 }
 static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
  return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
 }
 static WEBP_INLINE int VP8YUVToB(int y, int u) {
  return VP8Clip8(kYScale * y + kUToB * u + kBCst);
 }
 static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
                                    uint8_t* const rgb) {
  rgb[0] = VP8YUVToR(y, v);
  rgb[1] = VP8YUVToG(y, u, v);
  rgb[2] = VP8YUVToB(y, u);
 }
 static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
                                    uint8_t* const bgr) {
  bgr[0] = VP8YUVToB(y, u);
  bgr[1] = VP8YUVToG(y, u, v);
  bgr[2] = VP8YUVToR(y, v);
 }
 static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
                                       uint8_t* const rgb) {
  const int r = VP8YUVToR(y, v);      // 5 usable bits
  const int g = VP8YUVToG(y, u, v);   // 6 usable bits
  const int b = VP8YUVToB(y, u);      // 5 usable bits
  const int rg = (r & 0xf8) | (g >> 5);
  const int gb = ((g << 3) & 0xe0) | (b >> 3);
 #ifdef WEBP_SWAP_16BIT_CSP
  rgb[0] = gb;
  rgb[1] = rg;
 #else
  rgb[0] = rg;
  rgb[1] = gb;
 #endif
 }
 static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
                                         uint8_t* const argb) {
  const int r = VP8YUVToR(y, v);        // 4 usable bits
  const int g = VP8YUVToG(y, u, v);     // 4 usable bits
  const int b = VP8YUVToB(y, u);        // 4 usable bits
  const int rg = (r & 0xf0) | (g >> 4);
  const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
 #ifdef WEBP_SWAP_16BIT_CSP
  argb[0] = ba;
  argb[1] = rg;
 #else
  argb[0] = rg;
  argb[1] = ba;
 #endif
 }
 #else
 // Table-based version, not totally equivalent to the SSE2 version.
 // Rounding diff is only +/-1 though.
 extern int16_t VP8kVToR[256], VP8kUToB[256];
 extern int32_t VP8kVToG[256], VP8kUToG[256];
 extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
                                    uint8_t* const rgb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@ -78,7 +169,7 @@ static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
 }
-static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
                                    uint8_t* const bgr) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@ -88,14 +179,14 @@ static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
 }
-static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
                                       uint8_t* const rgb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
-  const uint8_t rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+  const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
                  (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
-  const uint8_t gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+  const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
                   (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
 #ifdef WEBP_SWAP_16BIT_CSP
  rgb[0] = gb;
@ -106,94 +197,14 @@ static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
 #endif
 }
-static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
                                         uint8_t* const argb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
-  const uint8_t rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+  const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
                   VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
-  const uint8_t ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
+  const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
 #ifdef WEBP_SWAP_16BIT_CSP
  argb[0] = ba;
  argb[1] = rg;
 #else
  argb[0] = rg;
  argb[1] = ba;
 #endif
 }
 #else   // Table-free version (slower on x86)
 // These constants are 16b fixed-point version of ITU-R BT.601 constants
 #define kYScale 76309      // 1.164 = 255 / 219
 #define kVToR   104597     // 1.596 = 255 / 112 * 0.701
 #define kUToG   25674      // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
 #define kVToG   53278      // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
 #define kUToB   132201     // 2.018 = 255 / 112 * 0.886
 #define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF)
 #define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF)
 #define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF)
 static WEBP_INLINE uint8_t VP8Clip8(int v) {
  return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> YUV_FIX)
                                : (v < 0) ? 0u : 255u;
 }
 static WEBP_INLINE uint8_t VP8ClipN(int v, int N) {  // clip to N bits
  return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> (YUV_FIX + (8 - N)))
                                : (v < 0) ? 0u : (255u >> (8 - N));
 }
 static WEBP_INLINE int VP8YUVToR(int y, int v) {
  return kYScale * y + kVToR * v + kRCst;
 }
 static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
  return kYScale * y - kUToG * u - kVToG * v + kGCst;
 }
 static WEBP_INLINE int VP8YUVToB(int y, int u) {
  return kYScale * y  + kUToB * u + kBCst;
 }
 static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
                                    uint8_t* const rgb) {
  rgb[0] = VP8Clip8(VP8YUVToR(y, v));
  rgb[1] = VP8Clip8(VP8YUVToG(y, u, v));
  rgb[2] = VP8Clip8(VP8YUVToB(y, u));
 }
 static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
                                    uint8_t* const bgr) {
  bgr[0] = VP8Clip8(VP8YUVToB(y, u));
  bgr[1] = VP8Clip8(VP8YUVToG(y, u, v));
  bgr[2] = VP8Clip8(VP8YUVToR(y, v));
 }
 static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
                                       uint8_t* const rgb) {
  const int r = VP8Clip8(VP8YUVToR(y, u));
  const int g = VP8ClipN(VP8YUVToG(y, u, v), 6);
  const int b = VP8ClipN(VP8YUVToB(y, v), 5);
  const uint8_t rg = (r & 0xf8) | (g >> 3);
  const uint8_t gb = (g << 5) | b;
 #ifdef WEBP_SWAP_16BIT_CSP
  rgb[0] = gb;
  rgb[1] = rg;
 #else
  rgb[0] = rg;
  rgb[1] = gb;
 #endif
 }
 static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
                                         uint8_t* const argb) {
  const int r = VP8Clip8(VP8YUVToR(y, u));
  const int g = VP8ClipN(VP8YUVToG(y, u, v), 4);
  const int b = VP8Clip8(VP8YUVToB(y, v));
  const uint8_t rg = (r & 0xf0) | g;
  const uint8_t ba = b | 0x0f;   // overwrite the lower 4 bits
 #ifdef WEBP_SWAP_16BIT_CSP
  argb[0] = ba;
  argb[1] = rg;
@ -205,6 +216,9 @@ static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
 #endif  // WEBP_YUV_USE_TABLE
 //-----------------------------------------------------------------------------
 // Alpha handling variants
 static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const argb) {
  argb[0] = 0xff;
@ -226,56 +240,81 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);
 //-----------------------------------------------------------------------------
 // SSE2 extra functions (mostly for upsampling_sse2.c)
 #if defined(WEBP_USE_SSE2)
 // When the following is defined, tables are initialized statically, adding ~12k
 // to the binary size. Otherwise, they are initialized at run-time (small cost).
 #define WEBP_YUV_USE_SSE2_TABLES
 #if defined(FANCY_UPSAMPLING)
 // Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst);
 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst);
 void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst);
 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst);
 #endif  // FANCY_UPSAMPLING
 // Must be called to initialize tables before using the functions.
 void VP8YUVInitSSE2(void);
 #endif    // WEBP_USE_SSE2
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
-static WEBP_INLINE int VP8ClipUV(int v) {
+// Stub functions that can be called with various rounding values:
-  v = (v + (257 << (YUV_FIX + 2 - 1))) >> (YUV_FIX + 2);
+static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
-  return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
+  uv = (uv + rounding + (128 << (YUV_FIX + 2))) >> (YUV_FIX + 2);
  return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
 }
 #ifndef USE_YUVj
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
  const int kRound = (1 << (YUV_FIX - 1)) + (16 << YUV_FIX);
  const int luma = 16839 * r + 33059 * g + 6420 * b;
-  return (luma + kRound) >> YUV_FIX;  // no need to clip
+  return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX;  // no need to clip
 }
-static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
  const int u = -9719 * r - 19081 * g + 28800 * b;
-  return VP8ClipUV(u);
+  return VP8ClipUV(u, rounding);
 }
-static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
  const int v = +28800 * r - 24116 * g - 4684 * b;
-  return VP8ClipUV(v);
+  return VP8ClipUV(v, rounding);
 }
 #else
 // This JPEG-YUV colorspace, only for comparison!
-// These are also 16-bit precision coefficients from Rec.601, but with full
+// These are also 16bit precision coefficients from Rec.601, but with full
 // [0..255] output range.
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
  const int kRound = (1 << (YUV_FIX - 1));
  const int luma = 19595 * r + 38470 * g + 7471 * b;
-  return (luma + kRound) >> YUV_FIX;  // no need to clip
+  return (luma + rounding) >> YUV_FIX;  // no need to clip
 }
-static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
  const int u = -11058 * r - 21710 * g + 32768 * b;
-  return VP8ClipUV(u);
+  return VP8ClipUV(u, rounding);
 }
-static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
  const int v = 32768 * r - 27439 * g - 5329 * b;
-  return VP8ClipUV(v);
+  return VP8ClipUV(v, rounding);
 }
 #endif    // USE_YUVj
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/src/dsp/yuv_mips32.c
+++ b/src/dsp/yuv_mips32.c
@ -0,0 +1,100 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // MIPS version of YUV to RGB upsampling functions.
 //
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 #include "./dsp.h"
 #if defined(WEBP_USE_MIPS32)
 #include "./yuv.h"
 //------------------------------------------------------------------------------
 // simple point-sampling
 // Expands to a point-sampling row converter named FUNC_NAME. Each output pixel
 // takes XSTEP bytes, with red, green and blue stored at byte offsets R, G and
 // B. When A is non-zero, the byte at offset A is set to 0xff. One (u, v) pair
 // is shared by two consecutive luma samples; an odd trailing sample uses the
 // next (u, v) pair on its own.
 #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A)                                 \
 static void FUNC_NAME(const uint8_t* y,                                        \
                      const uint8_t* u, const uint8_t* v,                      \
                      uint8_t* dst, int len) {                                 \
  int pairs_left;                                                              \
  int luma0, luma1, r_off, g_off, b_off;                                       \
  for (pairs_left = len >> 1; pairs_left > 0; --pairs_left) {                  \
    /* Chroma terms, computed once and reused for both pixels. */              \
    r_off = kVToR * v[0] + kRCst;                                              \
    g_off = kUToG * u[0] + (kVToG * v[0] - kGCst);                             \
    b_off = kUToB * u[0] + kBCst;                                              \
    /* Both luma samples are read before anything is written to dst. */        \
    luma0 = kYScale * y[0];                                                    \
    luma1 = kYScale * y[1];                                                    \
    dst[R] = VP8Clip8(luma0 + r_off);                                          \
    dst[G] = VP8Clip8(luma0 - g_off);                                          \
    dst[B] = VP8Clip8(luma0 + b_off);                                          \
    if (A) dst[A] = 0xff;                                                      \
    dst[R + XSTEP] = VP8Clip8(luma1 + r_off);                                  \
    dst[G + XSTEP] = VP8Clip8(luma1 - g_off);                                  \
    dst[B + XSTEP] = VP8Clip8(luma1 + b_off);                                  \
    if (A) dst[A + XSTEP] = 0xff;                                              \
    y += 2;                                                                    \
    ++u;                                                                       \
    ++v;                                                                       \
    dst += 2 * XSTEP;                                                          \
  }                                                                            \
  if (len & 1) {                                                               \
    /* Odd length: one last pixel with its own chroma sample. */               \
    r_off = kVToR * v[0] + kRCst;                                              \
    g_off = kUToG * u[0] + (kVToG * v[0] - kGCst);                             \
    b_off = kUToB * u[0] + kBCst;                                              \
    luma0 = kYScale * y[0];                                                    \
    dst[R] = VP8Clip8(luma0 + r_off);                                          \
    dst[G] = VP8Clip8(luma0 - g_off);                                          \
    dst[B] = VP8Clip8(luma0 + b_off);                                          \
    if (A) dst[A] = 0xff;                                                      \
  }                                                                            \
 }
 ROW_FUNC(YuvToRgbRow,      3, 0, 1, 2, 0)
 ROW_FUNC(YuvToRgbaRow,     4, 0, 1, 2, 3)
 ROW_FUNC(YuvToBgrRow,      3, 2, 1, 0, 0)
 ROW_FUNC(YuvToBgraRow,     4, 2, 1, 0, 3)
 #undef ROW_FUNC
 #endif   // WEBP_USE_MIPS32
 //------------------------------------------------------------------------------
 // Installs the MIPS32 row converters into WebPSamplers[] for the RGB, RGBA,
 // BGR and BGRA modes. Compiles to an empty function when WEBP_USE_MIPS32 is
 // not defined.
 extern void WebPInitSamplersMIPS32(void);
 void WebPInitSamplersMIPS32(void) {
 #if defined(WEBP_USE_MIPS32)
  // 3 bytes per pixel
  WebPSamplers[MODE_BGR]  = YuvToBgrRow;
  WebPSamplers[MODE_RGB]  = YuvToRgbRow;
  // 4 bytes per pixel, alpha forced to 0xff
  WebPSamplers[MODE_BGRA] = YuvToBgraRow;
  WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
 #endif  // WEBP_USE_MIPS32
 }
--- a/src/dsp/yuv_sse2.c
+++ b/src/dsp/yuv_sse2.c
@ -0,0 +1,322 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // YUV->RGB conversion functions
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./yuv.h"
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 #include <string.h>   // for memcpy
 // Union giving three views of the same 16 bytes, used for the SSE2 conversion
 // tables: four 32-bit lanes, sixteen bytes, or one SSE2 register.
 typedef union {   // handy union for converting SSE2 registers
  int32_t i32[4];   // view as four 32-bit lanes
  uint8_t u8[16];   // view as sixteen bytes
  __m128i m;        // view as one 128-bit SSE2 value
 } VP8kCstSSE2;
 #if defined(WEBP_YUV_USE_SSE2_TABLES)
 // The tables come pre-computed from the header: nothing to do at run-time.
 #include "./yuv_tables_sse2.h"
 void VP8YUVInitSSE2(void) {}
 #else
 // Set once the run-time tables below have been filled.
 // NOTE(review): plain int, no synchronization visible here -- confirm that
 // callers serialize the first call.
 static int done_sse2 = 0;
 static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
 // Fills the three 256-entry tables (one entry per possible sample value) on
 // the first call; later calls return immediately.
 // Lane layout of each entry is {R, G, B, A}:
 //  - Y entries carry the luma term plus rounding in R/G/B, and 0xff << YUV_FIX2
 //    in A, so that the final right shift by YUV_FIX2 leaves 0xff as alpha.
 //  - U entries contribute to G and B only.
 //  - V entries contribute to R and G only.
 void VP8YUVInitSSE2(void) {
  if (!done_sse2) {
    int i;
    for (i = 0; i < 256; ++i) {
      VP8kYtoRGBA[i].i32[0] =
        VP8kYtoRGBA[i].i32[1] =
        VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
      VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
      VP8kUtoRGBA[i].i32[0] = 0;
      VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
      VP8kUtoRGBA[i].i32[2] =  kUToB * (i - 128);
      VP8kUtoRGBA[i].i32[3] = 0;
      VP8kVtoRGBA[i].i32[0] =  kVToR * (i - 128);
      VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
      VP8kVtoRGBA[i].i32[2] = 0;
      VP8kVtoRGBA[i].i32[3] = 0;
    }
    done_sse2 = 1;
 #if 0   // code used to generate 'yuv_tables_sse2.h'
    printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n");
    for (i = 0; i < 256; ++i) {
      printf("  {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n",
             VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1],
             VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]);
    }
    printf("};\n\n");
    printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n");
    for (i = 0; i < 256; ++i) {
      printf("  {{0, 0x%.8x, 0x%.8x, 0}},\n",
             VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]);
    }
    printf("};\n\n");
    // Emitted as 'const' like the two tables above.
    printf("static const VP8kCstSSE2 VP8kVtoRGBA[256] = {\n");
    for (i = 0; i < 256; ++i) {
      printf("  {{0x%.8x, 0x%.8x, 0, 0}},\n",
             VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]);
    }
    printf("};\n\n");
 #endif
  }
 }
 #endif  // WEBP_YUV_USE_SSE2_TABLES
 //-----------------------------------------------------------------------------
 // Returns the lane-wise 32-bit sum of the table entries VP8kUtoRGBA[u] and
 // VP8kVtoRGBA[v].
 static WEBP_INLINE __m128i LoadUVPart(int u, int v) {
  const __m128i from_u = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
  const __m128i from_v = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
  return _mm_add_epi32(from_u, from_v);
 }
 // Adds the table entry VP8kYtoRGBA[y] to 'uv_part' lane by lane, then shifts
 // each 32-bit lane right (arithmetically) by YUV_FIX2.
 static WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) {
  const __m128i from_y = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
  const __m128i scaled_sum = _mm_add_epi32(from_y, uv_part);
  return _mm_srai_epi32(scaled_sum, YUV_FIX2);
 }
 // Convenience wrapper: looks up the chroma part for (u, v) and combines it
 // with the luma entry for 'y'.
 static WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) {
  return GetRGBA32bWithUV(y, LoadUVPart(u, v));
 }
 // Converts one sample and stores it at 'rgb' in R, G, B order.
 // WARNING: the store is 8 bytes wide, so 5 bytes past the 3-byte pixel are
 // overwritten too. Callers must make sure that is harmless.
 static WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const rgb) {
  const __m128i lanes32 = GetRGBA32b(y, u, v);
  const __m128i lanes16 = _mm_packs_epi32(lanes32, lanes32);    // saturate to 16b
  const __m128i lanes8 = _mm_packus_epi16(lanes16, lanes16);    // saturate to 8b
  _mm_storel_epi64((__m128i*)rgb, lanes8);
 }
 // Converts one sample and stores it at 'bgr' in B, G, R order.
 // WARNING: the store is 8 bytes wide, so 5 bytes past the 3-byte pixel are
 // overwritten too. Callers must make sure that is harmless.
 static WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const bgr) {
  const __m128i rgba32 = GetRGBA32b(y, u, v);
  // Swap lanes 0 and 2 (R <-> B); lanes 1 and 3 stay in place.
  const __m128i bgra32 = _mm_shuffle_epi32(rgba32, _MM_SHUFFLE(3, 0, 1, 2));
  const __m128i bgra16 = _mm_packs_epi32(bgra32, bgra32);
  const __m128i bgra8 = _mm_packus_epi16(bgra16, bgra16);
  _mm_storel_epi64((__m128i*)bgr, bgra8);
 }
 //-----------------------------------------------------------------------------
 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
 #ifdef FANCY_UPSAMPLING
 // Converts 32 samples (one u and one v per y) to 32 RGBA pixels, i.e. 128
 // bytes written to 'dst', four pixels per 16-byte store.
 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst) {
  int group;
  for (group = 0; group < 32 / 4; ++group) {
    const __m128i px0 = GetRGBA32b(y[0], u[0], v[0]);
    const __m128i px1 = GetRGBA32b(y[1], u[1], v[1]);
    const __m128i px2 = GetRGBA32b(y[2], u[2], v[2]);
    const __m128i px3 = GetRGBA32b(y[3], u[3], v[3]);
    const __m128i lo16 = _mm_packs_epi32(px0, px1);
    const __m128i hi16 = _mm_packs_epi32(px2, px3);
    _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(lo16, hi16));
    y += 4;
    u += 4;
    v += 4;
    dst += 4 * 4;
  }
 }
 // Converts 32 samples (one u and one v per y) to 32 BGRA pixels, i.e. 128
 // bytes written to 'dst', two pixels per 8-byte store.
 void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst) {
  int pair;
  for (pair = 0; pair < 32 / 2; ++pair) {
    const __m128i rgba_a = GetRGBA32b(y[0], u[0], v[0]);
    const __m128i rgba_b = GetRGBA32b(y[1], u[1], v[1]);
    // Swap lanes 0 and 2 (R <-> B) in each pixel.
    const __m128i bgra_a = _mm_shuffle_epi32(rgba_a, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i bgra_b = _mm_shuffle_epi32(rgba_b, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i both16 = _mm_packs_epi32(bgra_a, bgra_b);
    _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(both16, both16));
    y += 2;
    u += 2;
    v += 2;
    dst += 4 * 2;
  }
 }
 // Converts 32 samples (one u and one v per y) to 32 RGB pixels, i.e. 96 bytes
 // written to 'dst'. Each single-pixel store is 8 bytes wide: for the first 30
 // pixels the 5 extra bytes land inside 'dst' and get overwritten by later
 // pixels. The last two pixels go through a scratch buffer so that nothing is
 // written past dst[95].
 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  uint8_t scratch_mem[2 * 3 + 5 + 15];
  // 16-byte aligned start inside scratch_mem
  uint8_t* const scratch =
      (uint8_t*)((uintptr_t)(scratch_mem + 15) & ~15);
  uint8_t* out = dst;
  int n = 0;
  while (n < 30) {   // stores spill over into the following pixels of *dst
    YuvToRgbSSE2(y[n], u[n], v[n], out);
    out += 3;
    ++n;
  }
  // Pixels 30 and 31: convert into the scratch area, then copy 6 bytes.
  YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], scratch + 0);
  YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], scratch + 3);
  memcpy(out, scratch, 2 * 3);
 }
 // Converts 32 samples (one u and one v per y) to 32 BGR pixels, i.e. 96 bytes
 // written to 'dst'. Each single-pixel store is 8 bytes wide: for the first 30
 // pixels the 5 extra bytes land inside 'dst' and get overwritten by later
 // pixels. The last two pixels go through a scratch buffer so that nothing is
 // written past dst[95].
 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  uint8_t scratch_mem[2 * 3 + 5 + 15];
  // 16-byte aligned start inside scratch_mem
  uint8_t* const scratch =
      (uint8_t*)((uintptr_t)(scratch_mem + 15) & ~15);
  uint8_t* out = dst;
  int n = 0;
  while (n < 30) {   // stores spill over into the following pixels of *dst
    YuvToBgrSSE2(y[n], u[n], v[n], out);
    out += 3;
    ++n;
  }
  // Pixels 30 and 31: convert into the scratch area, then copy 6 bytes.
  YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], scratch + 0);
  YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], scratch + 3);
  memcpy(out, scratch, 2 * 3);
 }
 #endif  // FANCY_UPSAMPLING
 //-----------------------------------------------------------------------------
 // Arbitrary-length row conversion functions
 // Converts 'len' luma samples to RGBA (4 bytes per pixel). One (u, v) pair is
 // shared by two consecutive luma samples. Groups of four pixels use SSE2; the
 // remaining 0..3 pixels go through VP8YuvToRgba().
 static void YuvToRgbaRowSSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n = 0;
  while (n + 4 <= len) {
    const __m128i chroma_a = LoadUVPart(u[0], v[0]);   // for pixels 0 and 1
    const __m128i chroma_b = LoadUVPart(u[1], v[1]);   // for pixels 2 and 3
    const __m128i px0 = GetRGBA32bWithUV(y[0], chroma_a);
    const __m128i px1 = GetRGBA32bWithUV(y[1], chroma_a);
    const __m128i px2 = GetRGBA32bWithUV(y[2], chroma_b);
    const __m128i px3 = GetRGBA32bWithUV(y[3], chroma_b);
    const __m128i lo16 = _mm_packs_epi32(px0, px1);
    const __m128i hi16 = _mm_packs_epi32(px2, px3);
    _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(lo16, hi16));
    dst += 4 * 4;
    y += 4;
    u += 2;
    v += 2;
    n += 4;
  }
  // Left-over pixels, one at a time.
  for (; n < len; ++n) {
    VP8YuvToRgba(y[0], u[0], v[0], dst);
    dst += 4;
    ++y;
    if (n & 1) {   // chroma advances after every odd-indexed pixel
      ++u;
      ++v;
    }
  }
 }
 // Converts 'len' luma samples to BGRA (4 bytes per pixel). One (u, v) pair is
 // shared by two consecutive luma samples. Pairs of pixels use SSE2; an odd
 // trailing pixel goes through VP8YuvToBgra().
 static void YuvToBgraRowSSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n = 0;
  while (n + 2 <= len) {
    const __m128i chroma = LoadUVPart(u[0], v[0]);
    const __m128i rgba_a = GetRGBA32bWithUV(y[0], chroma);
    const __m128i rgba_b = GetRGBA32bWithUV(y[1], chroma);
    // Swap lanes 0 and 2 (R <-> B) in each pixel.
    const __m128i bgra_a = _mm_shuffle_epi32(rgba_a, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i bgra_b = _mm_shuffle_epi32(rgba_b, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i both16 = _mm_packs_epi32(bgra_a, bgra_b);
    _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(both16, both16));
    dst += 4 * 2;
    y += 2;
    ++u;
    ++v;
    n += 2;
  }
  // Odd length: one last pixel.
  if (len & 1) {
    VP8YuvToBgra(y[0], u[0], v[0], dst);
  }
 }
 // Converts 'len' luma samples to ARGB (4 bytes per pixel). One (u, v) pair is
 // shared by two consecutive luma samples. Pairs of pixels use SSE2; an odd
 // trailing pixel goes through VP8YuvToArgb().
 static void YuvToArgbRowSSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n = 0;
  while (n + 2 <= len) {
    const __m128i chroma = LoadUVPart(u[0], v[0]);
    const __m128i rgba_a = GetRGBA32bWithUV(y[0], chroma);
    const __m128i rgba_b = GetRGBA32bWithUV(y[1], chroma);
    // Rotate the lanes so that lane 3 (alpha) comes first: R,G,B,A -> A,R,G,B.
    const __m128i argb_a = _mm_shuffle_epi32(rgba_a, _MM_SHUFFLE(2, 1, 0, 3));
    const __m128i argb_b = _mm_shuffle_epi32(rgba_b, _MM_SHUFFLE(2, 1, 0, 3));
    const __m128i both16 = _mm_packs_epi32(argb_a, argb_b);
    _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(both16, both16));
    dst += 4 * 2;
    y += 2;
    ++u;
    ++v;
    n += 2;
  }
  // Odd length: one last pixel.
  if (len & 1) {
    VP8YuvToArgb(y[0], u[0], v[0], dst);
  }
 }
 // Converts 'len' luma samples to RGB (3 bytes per pixel). One (u, v) pair is
 // shared by two consecutive luma samples.
 // The SSE2 single-pixel store is 8 bytes wide, so it is used only while at
 // least two more pixels follow in 'dst'. The last one or two pixels go
 // through VP8YuvToRgb(), which writes exactly 3 bytes.
 // Does nothing when 'len' is not positive.
 static void YuvToRgbRowSSE2(const uint8_t* y,
                            const uint8_t* u, const uint8_t* v,
                            uint8_t* dst, int len) {
  int n;
  if (len <= 0) return;   // empty row: do not read or write any sample
  for (n = 0; n + 2 < len; ++n) {   // we directly stomp the *dst memory
    YuvToRgbSSE2(y[0], u[0], v[0], dst);  // stomps 8 bytes
    dst += 3;
    ++y;
    u += (n & 1);   // chroma advances after every odd-indexed pixel
    v += (n & 1);
  }
  VP8YuvToRgb(y[0], u[0], v[0], dst);
  if (len > 1) {
    // Pixel n + 1 shares the chroma of pixel n only when n is even.
    VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3);
  }
 }
 // Converts 'len' luma samples to BGR (3 bytes per pixel). One (u, v) pair is
 // shared by two consecutive luma samples.
 // The SSE2 single-pixel store is 8 bytes wide, so it is used only while at
 // least two more pixels follow in 'dst'. The last one or two pixels go
 // through VP8YuvToBgr(), which writes exactly 3 bytes.
 // Does nothing when 'len' is not positive.
 static void YuvToBgrRowSSE2(const uint8_t* y,
                            const uint8_t* u, const uint8_t* v,
                            uint8_t* dst, int len) {
  int n;
  if (len <= 0) return;   // empty row: do not read or write any sample
  for (n = 0; n + 2 < len; ++n) {   // we directly stomp the *dst memory
    YuvToBgrSSE2(y[0], u[0], v[0], dst);  // stomps 8 bytes
    dst += 3;
    ++y;
    u += (n & 1);   // chroma advances after every odd-indexed pixel
    v += (n & 1);
  }
  VP8YuvToBgr(y[0], u[0], v[0], dst + 0);
  if (len > 1) {
    // Pixel n + 1 shares the chroma of pixel n only when n is even.
    VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3);
  }
 }
 #endif  // WEBP_USE_SSE2
 //------------------------------------------------------------------------------
 // Entry point
 // Installs the SSE2 row converters into WebPSamplers[] for the RGB, BGR,
 // RGBA, BGRA and ARGB modes. Compiles to an empty function when
 // WEBP_USE_SSE2 is not defined.
 extern void WebPInitSamplersSSE2(void);
 void WebPInitSamplersSSE2(void) {
 #if defined(WEBP_USE_SSE2)
  // 3 bytes per pixel
  WebPSamplers[MODE_BGR]  = YuvToBgrRowSSE2;
  WebPSamplers[MODE_RGB]  = YuvToRgbRowSSE2;
  // 4 bytes per pixel
  WebPSamplers[MODE_ARGB] = YuvToArgbRowSSE2;
  WebPSamplers[MODE_BGRA] = YuvToBgraRowSSE2;
  WebPSamplers[MODE_RGBA] = YuvToRgbaRowSSE2;
 #endif  // WEBP_USE_SSE2
 }
--- a/src/dsp/yuv_tables_sse2.h
+++ b/src/dsp/yuv_tables_sse2.h
@ -0,0 +1,536 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 tables for YUV->RGB conversion (12kB overall)
 //
 // Author: Skal (pascal.massimino@gmail.com)
 // This file is not compiled, but #include'd directly from yuv_sse2.c
 // Only used if WEBP_YUV_USE_SSE2_TABLES is defined.
 static const VP8kCstSSE2 VP8kYtoRGBA[256] = {
  {{0xfffb77b0, 0xfffb77b0, 0xfffb77b0, 0x003fc000}},
  {{0xfffbc235, 0xfffbc235, 0xfffbc235, 0x003fc000}},
  {{0xfffc0cba, 0xfffc0cba, 0xfffc0cba, 0x003fc000}},
  {{0xfffc573f, 0xfffc573f, 0xfffc573f, 0x003fc000}},
  {{0xfffca1c4, 0xfffca1c4, 0xfffca1c4, 0x003fc000}},
  {{0xfffcec49, 0xfffcec49, 0xfffcec49, 0x003fc000}},
  {{0xfffd36ce, 0xfffd36ce, 0xfffd36ce, 0x003fc000}},
  {{0xfffd8153, 0xfffd8153, 0xfffd8153, 0x003fc000}},
  {{0xfffdcbd8, 0xfffdcbd8, 0xfffdcbd8, 0x003fc000}},
  {{0xfffe165d, 0xfffe165d, 0xfffe165d, 0x003fc000}},
  {{0xfffe60e2, 0xfffe60e2, 0xfffe60e2, 0x003fc000}},
  {{0xfffeab67, 0xfffeab67, 0xfffeab67, 0x003fc000}},
  {{0xfffef5ec, 0xfffef5ec, 0xfffef5ec, 0x003fc000}},
  {{0xffff4071, 0xffff4071, 0xffff4071, 0x003fc000}},
  {{0xffff8af6, 0xffff8af6, 0xffff8af6, 0x003fc000}},
  {{0xffffd57b, 0xffffd57b, 0xffffd57b, 0x003fc000}},
  {{0x00002000, 0x00002000, 0x00002000, 0x003fc000}},
  {{0x00006a85, 0x00006a85, 0x00006a85, 0x003fc000}},
  {{0x0000b50a, 0x0000b50a, 0x0000b50a, 0x003fc000}},
  {{0x0000ff8f, 0x0000ff8f, 0x0000ff8f, 0x003fc000}},
  {{0x00014a14, 0x00014a14, 0x00014a14, 0x003fc000}},
  {{0x00019499, 0x00019499, 0x00019499, 0x003fc000}},
  {{0x0001df1e, 0x0001df1e, 0x0001df1e, 0x003fc000}},
  {{0x000229a3, 0x000229a3, 0x000229a3, 0x003fc000}},
  {{0x00027428, 0x00027428, 0x00027428, 0x003fc000}},
  {{0x0002bead, 0x0002bead, 0x0002bead, 0x003fc000}},
  {{0x00030932, 0x00030932, 0x00030932, 0x003fc000}},
  {{0x000353b7, 0x000353b7, 0x000353b7, 0x003fc000}},
  {{0x00039e3c, 0x00039e3c, 0x00039e3c, 0x003fc000}},
  {{0x0003e8c1, 0x0003e8c1, 0x0003e8c1, 0x003fc000}},
  {{0x00043346, 0x00043346, 0x00043346, 0x003fc000}},
  {{0x00047dcb, 0x00047dcb, 0x00047dcb, 0x003fc000}},
  {{0x0004c850, 0x0004c850, 0x0004c850, 0x003fc000}},
  {{0x000512d5, 0x000512d5, 0x000512d5, 0x003fc000}},
  {{0x00055d5a, 0x00055d5a, 0x00055d5a, 0x003fc000}},
  {{0x0005a7df, 0x0005a7df, 0x0005a7df, 0x003fc000}},
  {{0x0005f264, 0x0005f264, 0x0005f264, 0x003fc000}},
  {{0x00063ce9, 0x00063ce9, 0x00063ce9, 0x003fc000}},
  {{0x0006876e, 0x0006876e, 0x0006876e, 0x003fc000}},
  {{0x0006d1f3, 0x0006d1f3, 0x0006d1f3, 0x003fc000}},
  {{0x00071c78, 0x00071c78, 0x00071c78, 0x003fc000}},
  {{0x000766fd, 0x000766fd, 0x000766fd, 0x003fc000}},
  {{0x0007b182, 0x0007b182, 0x0007b182, 0x003fc000}},
  {{0x0007fc07, 0x0007fc07, 0x0007fc07, 0x003fc000}},
  {{0x0008468c, 0x0008468c, 0x0008468c, 0x003fc000}},
  {{0x00089111, 0x00089111, 0x00089111, 0x003fc000}},
  {{0x0008db96, 0x0008db96, 0x0008db96, 0x003fc000}},
  {{0x0009261b, 0x0009261b, 0x0009261b, 0x003fc000}},
  {{0x000970a0, 0x000970a0, 0x000970a0, 0x003fc000}},
  {{0x0009bb25, 0x0009bb25, 0x0009bb25, 0x003fc000}},
  {{0x000a05aa, 0x000a05aa, 0x000a05aa, 0x003fc000}},
  {{0x000a502f, 0x000a502f, 0x000a502f, 0x003fc000}},
  {{0x000a9ab4, 0x000a9ab4, 0x000a9ab4, 0x003fc000}},
  {{0x000ae539, 0x000ae539, 0x000ae539, 0x003fc000}},
  {{0x000b2fbe, 0x000b2fbe, 0x000b2fbe, 0x003fc000}},
  {{0x000b7a43, 0x000b7a43, 0x000b7a43, 0x003fc000}},
  {{0x000bc4c8, 0x000bc4c8, 0x000bc4c8, 0x003fc000}},
  {{0x000c0f4d, 0x000c0f4d, 0x000c0f4d, 0x003fc000}},
  {{0x000c59d2, 0x000c59d2, 0x000c59d2, 0x003fc000}},
  {{0x000ca457, 0x000ca457, 0x000ca457, 0x003fc000}},
  {{0x000ceedc, 0x000ceedc, 0x000ceedc, 0x003fc000}},
  {{0x000d3961, 0x000d3961, 0x000d3961, 0x003fc000}},
  {{0x000d83e6, 0x000d83e6, 0x000d83e6, 0x003fc000}},
  {{0x000dce6b, 0x000dce6b, 0x000dce6b, 0x003fc000}},
  {{0x000e18f0, 0x000e18f0, 0x000e18f0, 0x003fc000}},
  {{0x000e6375, 0x000e6375, 0x000e6375, 0x003fc000}},
  {{0x000eadfa, 0x000eadfa, 0x000eadfa, 0x003fc000}},
  {{0x000ef87f, 0x000ef87f, 0x000ef87f, 0x003fc000}},
  {{0x000f4304, 0x000f4304, 0x000f4304, 0x003fc000}},
  {{0x000f8d89, 0x000f8d89, 0x000f8d89, 0x003fc000}},
  {{0x000fd80e, 0x000fd80e, 0x000fd80e, 0x003fc000}},
  {{0x00102293, 0x00102293, 0x00102293, 0x003fc000}},
  {{0x00106d18, 0x00106d18, 0x00106d18, 0x003fc000}},
  {{0x0010b79d, 0x0010b79d, 0x0010b79d, 0x003fc000}},
  {{0x00110222, 0x00110222, 0x00110222, 0x003fc000}},
  {{0x00114ca7, 0x00114ca7, 0x00114ca7, 0x003fc000}},
  {{0x0011972c, 0x0011972c, 0x0011972c, 0x003fc000}},
  {{0x0011e1b1, 0x0011e1b1, 0x0011e1b1, 0x003fc000}},
  {{0x00122c36, 0x00122c36, 0x00122c36, 0x003fc000}},
  {{0x001276bb, 0x001276bb, 0x001276bb, 0x003fc000}},
  {{0x0012c140, 0x0012c140, 0x0012c140, 0x003fc000}},
  {{0x00130bc5, 0x00130bc5, 0x00130bc5, 0x003fc000}},
  {{0x0013564a, 0x0013564a, 0x0013564a, 0x003fc000}},
  {{0x0013a0cf, 0x0013a0cf, 0x0013a0cf, 0x003fc000}},
  {{0x0013eb54, 0x0013eb54, 0x0013eb54, 0x003fc000}},
  {{0x001435d9, 0x001435d9, 0x001435d9, 0x003fc000}},
  {{0x0014805e, 0x0014805e, 0x0014805e, 0x003fc000}},
  {{0x0014cae3, 0x0014cae3, 0x0014cae3, 0x003fc000}},
  {{0x00151568, 0x00151568, 0x00151568, 0x003fc000}},
  {{0x00155fed, 0x00155fed, 0x00155fed, 0x003fc000}},
  {{0x0015aa72, 0x0015aa72, 0x0015aa72, 0x003fc000}},
  {{0x0015f4f7, 0x0015f4f7, 0x0015f4f7, 0x003fc000}},
  {{0x00163f7c, 0x00163f7c, 0x00163f7c, 0x003fc000}},
  {{0x00168a01, 0x00168a01, 0x00168a01, 0x003fc000}},
  {{0x0016d486, 0x0016d486, 0x0016d486, 0x003fc000}},
  {{0x00171f0b, 0x00171f0b, 0x00171f0b, 0x003fc000}},
  {{0x00176990, 0x00176990, 0x00176990, 0x003fc000}},
  {{0x0017b415, 0x0017b415, 0x0017b415, 0x003fc000}},
  {{0x0017fe9a, 0x0017fe9a, 0x0017fe9a, 0x003fc000}},
  {{0x0018491f, 0x0018491f, 0x0018491f, 0x003fc000}},
  {{0x001893a4, 0x001893a4, 0x001893a4, 0x003fc000}},
  {{0x0018de29, 0x0018de29, 0x0018de29, 0x003fc000}},
  {{0x001928ae, 0x001928ae, 0x001928ae, 0x003fc000}},
  {{0x00197333, 0x00197333, 0x00197333, 0x003fc000}},
  {{0x0019bdb8, 0x0019bdb8, 0x0019bdb8, 0x003fc000}},
  {{0x001a083d, 0x001a083d, 0x001a083d, 0x003fc000}},
  {{0x001a52c2, 0x001a52c2, 0x001a52c2, 0x003fc000}},
  {{0x001a9d47, 0x001a9d47, 0x001a9d47, 0x003fc000}},
  {{0x001ae7cc, 0x001ae7cc, 0x001ae7cc, 0x003fc000}},
  {{0x001b3251, 0x001b3251, 0x001b3251, 0x003fc000}},
  {{0x001b7cd6, 0x001b7cd6, 0x001b7cd6, 0x003fc000}},
  {{0x001bc75b, 0x001bc75b, 0x001bc75b, 0x003fc000}},
  {{0x001c11e0, 0x001c11e0, 0x001c11e0, 0x003fc000}},
  {{0x001c5c65, 0x001c5c65, 0x001c5c65, 0x003fc000}},
  {{0x001ca6ea, 0x001ca6ea, 0x001ca6ea, 0x003fc000}},
  {{0x001cf16f, 0x001cf16f, 0x001cf16f, 0x003fc000}},
  {{0x001d3bf4, 0x001d3bf4, 0x001d3bf4, 0x003fc000}},
  {{0x001d8679, 0x001d8679, 0x001d8679, 0x003fc000}},
  {{0x001dd0fe, 0x001dd0fe, 0x001dd0fe, 0x003fc000}},
  {{0x001e1b83, 0x001e1b83, 0x001e1b83, 0x003fc000}},
  {{0x001e6608, 0x001e6608, 0x001e6608, 0x003fc000}},
  {{0x001eb08d, 0x001eb08d, 0x001eb08d, 0x003fc000}},
  {{0x001efb12, 0x001efb12, 0x001efb12, 0x003fc000}},
  {{0x001f4597, 0x001f4597, 0x001f4597, 0x003fc000}},
  {{0x001f901c, 0x001f901c, 0x001f901c, 0x003fc000}},
  {{0x001fdaa1, 0x001fdaa1, 0x001fdaa1, 0x003fc000}},
  {{0x00202526, 0x00202526, 0x00202526, 0x003fc000}},
  {{0x00206fab, 0x00206fab, 0x00206fab, 0x003fc000}},
  {{0x0020ba30, 0x0020ba30, 0x0020ba30, 0x003fc000}},
  {{0x002104b5, 0x002104b5, 0x002104b5, 0x003fc000}},
  {{0x00214f3a, 0x00214f3a, 0x00214f3a, 0x003fc000}},
  {{0x002199bf, 0x002199bf, 0x002199bf, 0x003fc000}},
  {{0x0021e444, 0x0021e444, 0x0021e444, 0x003fc000}},
  {{0x00222ec9, 0x00222ec9, 0x00222ec9, 0x003fc000}},
  {{0x0022794e, 0x0022794e, 0x0022794e, 0x003fc000}},
  {{0x0022c3d3, 0x0022c3d3, 0x0022c3d3, 0x003fc000}},
  {{0x00230e58, 0x00230e58, 0x00230e58, 0x003fc000}},
  {{0x002358dd, 0x002358dd, 0x002358dd, 0x003fc000}},
  {{0x0023a362, 0x0023a362, 0x0023a362, 0x003fc000}},
  {{0x0023ede7, 0x0023ede7, 0x0023ede7, 0x003fc000}},
  {{0x0024386c, 0x0024386c, 0x0024386c, 0x003fc000}},
  {{0x002482f1, 0x002482f1, 0x002482f1, 0x003fc000}},
  {{0x0024cd76, 0x0024cd76, 0x0024cd76, 0x003fc000}},
  {{0x002517fb, 0x002517fb, 0x002517fb, 0x003fc000}},
  {{0x00256280, 0x00256280, 0x00256280, 0x003fc000}},
  {{0x0025ad05, 0x0025ad05, 0x0025ad05, 0x003fc000}},
  {{0x0025f78a, 0x0025f78a, 0x0025f78a, 0x003fc000}},
  {{0x0026420f, 0x0026420f, 0x0026420f, 0x003fc000}},
  {{0x00268c94, 0x00268c94, 0x00268c94, 0x003fc000}},
  {{0x0026d719, 0x0026d719, 0x0026d719, 0x003fc000}},
  {{0x0027219e, 0x0027219e, 0x0027219e, 0x003fc000}},
  {{0x00276c23, 0x00276c23, 0x00276c23, 0x003fc000}},
  {{0x0027b6a8, 0x0027b6a8, 0x0027b6a8, 0x003fc000}},
  {{0x0028012d, 0x0028012d, 0x0028012d, 0x003fc000}},
  {{0x00284bb2, 0x00284bb2, 0x00284bb2, 0x003fc000}},
  {{0x00289637, 0x00289637, 0x00289637, 0x003fc000}},
  {{0x0028e0bc, 0x0028e0bc, 0x0028e0bc, 0x003fc000}},
  {{0x00292b41, 0x00292b41, 0x00292b41, 0x003fc000}},
  {{0x002975c6, 0x002975c6, 0x002975c6, 0x003fc000}},
  {{0x0029c04b, 0x0029c04b, 0x0029c04b, 0x003fc000}},
  {{0x002a0ad0, 0x002a0ad0, 0x002a0ad0, 0x003fc000}},
  {{0x002a5555, 0x002a5555, 0x002a5555, 0x003fc000}},
  {{0x002a9fda, 0x002a9fda, 0x002a9fda, 0x003fc000}},
  {{0x002aea5f, 0x002aea5f, 0x002aea5f, 0x003fc000}},
  {{0x002b34e4, 0x002b34e4, 0x002b34e4, 0x003fc000}},
  {{0x002b7f69, 0x002b7f69, 0x002b7f69, 0x003fc000}},
  {{0x002bc9ee, 0x002bc9ee, 0x002bc9ee, 0x003fc000}},
  {{0x002c1473, 0x002c1473, 0x002c1473, 0x003fc000}},
  {{0x002c5ef8, 0x002c5ef8, 0x002c5ef8, 0x003fc000}},
  {{0x002ca97d, 0x002ca97d, 0x002ca97d, 0x003fc000}},
  {{0x002cf402, 0x002cf402, 0x002cf402, 0x003fc000}},
  {{0x002d3e87, 0x002d3e87, 0x002d3e87, 0x003fc000}},
  {{0x002d890c, 0x002d890c, 0x002d890c, 0x003fc000}},
  {{0x002dd391, 0x002dd391, 0x002dd391, 0x003fc000}},
  {{0x002e1e16, 0x002e1e16, 0x002e1e16, 0x003fc000}},
  {{0x002e689b, 0x002e689b, 0x002e689b, 0x003fc000}},
  {{0x002eb320, 0x002eb320, 0x002eb320, 0x003fc000}},
  {{0x002efda5, 0x002efda5, 0x002efda5, 0x003fc000}},
  {{0x002f482a, 0x002f482a, 0x002f482a, 0x003fc000}},
  {{0x002f92af, 0x002f92af, 0x002f92af, 0x003fc000}},
  {{0x002fdd34, 0x002fdd34, 0x002fdd34, 0x003fc000}},
  {{0x003027b9, 0x003027b9, 0x003027b9, 0x003fc000}},
  {{0x0030723e, 0x0030723e, 0x0030723e, 0x003fc000}},
  {{0x0030bcc3, 0x0030bcc3, 0x0030bcc3, 0x003fc000}},
  {{0x00310748, 0x00310748, 0x00310748, 0x003fc000}},
  {{0x003151cd, 0x003151cd, 0x003151cd, 0x003fc000}},
  {{0x00319c52, 0x00319c52, 0x00319c52, 0x003fc000}},
  {{0x0031e6d7, 0x0031e6d7, 0x0031e6d7, 0x003fc000}},
  {{0x0032315c, 0x0032315c, 0x0032315c, 0x003fc000}},
  {{0x00327be1, 0x00327be1, 0x00327be1, 0x003fc000}},
  {{0x0032c666, 0x0032c666, 0x0032c666, 0x003fc000}},
  {{0x003310eb, 0x003310eb, 0x003310eb, 0x003fc000}},
  {{0x00335b70, 0x00335b70, 0x00335b70, 0x003fc000}},
  {{0x0033a5f5, 0x0033a5f5, 0x0033a5f5, 0x003fc000}},
  {{0x0033f07a, 0x0033f07a, 0x0033f07a, 0x003fc000}},
  {{0x00343aff, 0x00343aff, 0x00343aff, 0x003fc000}},
  {{0x00348584, 0x00348584, 0x00348584, 0x003fc000}},
  {{0x0034d009, 0x0034d009, 0x0034d009, 0x003fc000}},
  {{0x00351a8e, 0x00351a8e, 0x00351a8e, 0x003fc000}},
  {{0x00356513, 0x00356513, 0x00356513, 0x003fc000}},
  {{0x0035af98, 0x0035af98, 0x0035af98, 0x003fc000}},
  {{0x0035fa1d, 0x0035fa1d, 0x0035fa1d, 0x003fc000}},
  {{0x003644a2, 0x003644a2, 0x003644a2, 0x003fc000}},
  {{0x00368f27, 0x00368f27, 0x00368f27, 0x003fc000}},
  {{0x0036d9ac, 0x0036d9ac, 0x0036d9ac, 0x003fc000}},
  {{0x00372431, 0x00372431, 0x00372431, 0x003fc000}},
  {{0x00376eb6, 0x00376eb6, 0x00376eb6, 0x003fc000}},
  {{0x0037b93b, 0x0037b93b, 0x0037b93b, 0x003fc000}},
  {{0x003803c0, 0x003803c0, 0x003803c0, 0x003fc000}},
  {{0x00384e45, 0x00384e45, 0x00384e45, 0x003fc000}},
  {{0x003898ca, 0x003898ca, 0x003898ca, 0x003fc000}},
  {{0x0038e34f, 0x0038e34f, 0x0038e34f, 0x003fc000}},
  {{0x00392dd4, 0x00392dd4, 0x00392dd4, 0x003fc000}},
  {{0x00397859, 0x00397859, 0x00397859, 0x003fc000}},
  {{0x0039c2de, 0x0039c2de, 0x0039c2de, 0x003fc000}},
  {{0x003a0d63, 0x003a0d63, 0x003a0d63, 0x003fc000}},
  {{0x003a57e8, 0x003a57e8, 0x003a57e8, 0x003fc000}},
  {{0x003aa26d, 0x003aa26d, 0x003aa26d, 0x003fc000}},
  {{0x003aecf2, 0x003aecf2, 0x003aecf2, 0x003fc000}},
  {{0x003b3777, 0x003b3777, 0x003b3777, 0x003fc000}},
  {{0x003b81fc, 0x003b81fc, 0x003b81fc, 0x003fc000}},
  {{0x003bcc81, 0x003bcc81, 0x003bcc81, 0x003fc000}},
  {{0x003c1706, 0x003c1706, 0x003c1706, 0x003fc000}},
  {{0x003c618b, 0x003c618b, 0x003c618b, 0x003fc000}},
  {{0x003cac10, 0x003cac10, 0x003cac10, 0x003fc000}},
  {{0x003cf695, 0x003cf695, 0x003cf695, 0x003fc000}},
  {{0x003d411a, 0x003d411a, 0x003d411a, 0x003fc000}},
  {{0x003d8b9f, 0x003d8b9f, 0x003d8b9f, 0x003fc000}},
  {{0x003dd624, 0x003dd624, 0x003dd624, 0x003fc000}},
  {{0x003e20a9, 0x003e20a9, 0x003e20a9, 0x003fc000}},
  {{0x003e6b2e, 0x003e6b2e, 0x003e6b2e, 0x003fc000}},
  {{0x003eb5b3, 0x003eb5b3, 0x003eb5b3, 0x003fc000}},
  {{0x003f0038, 0x003f0038, 0x003f0038, 0x003fc000}},
  {{0x003f4abd, 0x003f4abd, 0x003f4abd, 0x003fc000}},
  {{0x003f9542, 0x003f9542, 0x003f9542, 0x003fc000}},
  {{0x003fdfc7, 0x003fdfc7, 0x003fdfc7, 0x003fc000}},
  {{0x00402a4c, 0x00402a4c, 0x00402a4c, 0x003fc000}},
  {{0x004074d1, 0x004074d1, 0x004074d1, 0x003fc000}},
  {{0x0040bf56, 0x0040bf56, 0x0040bf56, 0x003fc000}},
  {{0x004109db, 0x004109db, 0x004109db, 0x003fc000}},
  {{0x00415460, 0x00415460, 0x00415460, 0x003fc000}},
  {{0x00419ee5, 0x00419ee5, 0x00419ee5, 0x003fc000}},
  {{0x0041e96a, 0x0041e96a, 0x0041e96a, 0x003fc000}},
  {{0x004233ef, 0x004233ef, 0x004233ef, 0x003fc000}},
  {{0x00427e74, 0x00427e74, 0x00427e74, 0x003fc000}},
  {{0x0042c8f9, 0x0042c8f9, 0x0042c8f9, 0x003fc000}},
  {{0x0043137e, 0x0043137e, 0x0043137e, 0x003fc000}},
  {{0x00435e03, 0x00435e03, 0x00435e03, 0x003fc000}},
  {{0x0043a888, 0x0043a888, 0x0043a888, 0x003fc000}},
  {{0x0043f30d, 0x0043f30d, 0x0043f30d, 0x003fc000}},
  {{0x00443d92, 0x00443d92, 0x00443d92, 0x003fc000}},
  {{0x00448817, 0x00448817, 0x00448817, 0x003fc000}},
  {{0x0044d29c, 0x0044d29c, 0x0044d29c, 0x003fc000}},
  {{0x00451d21, 0x00451d21, 0x00451d21, 0x003fc000}},
  {{0x004567a6, 0x004567a6, 0x004567a6, 0x003fc000}},
  {{0x0045b22b, 0x0045b22b, 0x0045b22b, 0x003fc000}}
 };
 // Read-only lookup table of 256 VP8kCstSSE2 entries, presumably indexed by the
 // 8-bit U (chroma) sample value. Every entry has zero in lanes 0 and 3; only
 // lanes 1 and 2 carry data, and both are zero at index 128.
 // NOTE(review): the meaning of each lane and the fixed-point scale are set by
 // the code that consumes this table, which is not visible here -- confirm.
 static const VP8kCstSSE2 VP8kUtoRGBA[256] = {
  {{0, 0x000c8980, 0xffbf7300, 0}}, {{0, 0x000c706d, 0xffbff41a, 0}},
  {{0, 0x000c575a, 0xffc07534, 0}}, {{0, 0x000c3e47, 0xffc0f64e, 0}},
  {{0, 0x000c2534, 0xffc17768, 0}}, {{0, 0x000c0c21, 0xffc1f882, 0}},
  {{0, 0x000bf30e, 0xffc2799c, 0}}, {{0, 0x000bd9fb, 0xffc2fab6, 0}},
  {{0, 0x000bc0e8, 0xffc37bd0, 0}}, {{0, 0x000ba7d5, 0xffc3fcea, 0}},
  {{0, 0x000b8ec2, 0xffc47e04, 0}}, {{0, 0x000b75af, 0xffc4ff1e, 0}},
  {{0, 0x000b5c9c, 0xffc58038, 0}}, {{0, 0x000b4389, 0xffc60152, 0}},
  {{0, 0x000b2a76, 0xffc6826c, 0}}, {{0, 0x000b1163, 0xffc70386, 0}},
  {{0, 0x000af850, 0xffc784a0, 0}}, {{0, 0x000adf3d, 0xffc805ba, 0}},
  {{0, 0x000ac62a, 0xffc886d4, 0}}, {{0, 0x000aad17, 0xffc907ee, 0}},
  {{0, 0x000a9404, 0xffc98908, 0}}, {{0, 0x000a7af1, 0xffca0a22, 0}},
  {{0, 0x000a61de, 0xffca8b3c, 0}}, {{0, 0x000a48cb, 0xffcb0c56, 0}},
  {{0, 0x000a2fb8, 0xffcb8d70, 0}}, {{0, 0x000a16a5, 0xffcc0e8a, 0}},
  {{0, 0x0009fd92, 0xffcc8fa4, 0}}, {{0, 0x0009e47f, 0xffcd10be, 0}},
  {{0, 0x0009cb6c, 0xffcd91d8, 0}}, {{0, 0x0009b259, 0xffce12f2, 0}},
  {{0, 0x00099946, 0xffce940c, 0}}, {{0, 0x00098033, 0xffcf1526, 0}},
  {{0, 0x00096720, 0xffcf9640, 0}}, {{0, 0x00094e0d, 0xffd0175a, 0}},
  {{0, 0x000934fa, 0xffd09874, 0}}, {{0, 0x00091be7, 0xffd1198e, 0}},
  {{0, 0x000902d4, 0xffd19aa8, 0}}, {{0, 0x0008e9c1, 0xffd21bc2, 0}},
  {{0, 0x0008d0ae, 0xffd29cdc, 0}}, {{0, 0x0008b79b, 0xffd31df6, 0}},
  {{0, 0x00089e88, 0xffd39f10, 0}}, {{0, 0x00088575, 0xffd4202a, 0}},
  {{0, 0x00086c62, 0xffd4a144, 0}}, {{0, 0x0008534f, 0xffd5225e, 0}},
  {{0, 0x00083a3c, 0xffd5a378, 0}}, {{0, 0x00082129, 0xffd62492, 0}},
  {{0, 0x00080816, 0xffd6a5ac, 0}}, {{0, 0x0007ef03, 0xffd726c6, 0}},
  {{0, 0x0007d5f0, 0xffd7a7e0, 0}}, {{0, 0x0007bcdd, 0xffd828fa, 0}},
  {{0, 0x0007a3ca, 0xffd8aa14, 0}}, {{0, 0x00078ab7, 0xffd92b2e, 0}},
  {{0, 0x000771a4, 0xffd9ac48, 0}}, {{0, 0x00075891, 0xffda2d62, 0}},
  {{0, 0x00073f7e, 0xffdaae7c, 0}}, {{0, 0x0007266b, 0xffdb2f96, 0}},
  {{0, 0x00070d58, 0xffdbb0b0, 0}}, {{0, 0x0006f445, 0xffdc31ca, 0}},
  {{0, 0x0006db32, 0xffdcb2e4, 0}}, {{0, 0x0006c21f, 0xffdd33fe, 0}},
  {{0, 0x0006a90c, 0xffddb518, 0}}, {{0, 0x00068ff9, 0xffde3632, 0}},
  {{0, 0x000676e6, 0xffdeb74c, 0}}, {{0, 0x00065dd3, 0xffdf3866, 0}},
  {{0, 0x000644c0, 0xffdfb980, 0}}, {{0, 0x00062bad, 0xffe03a9a, 0}},
  {{0, 0x0006129a, 0xffe0bbb4, 0}}, {{0, 0x0005f987, 0xffe13cce, 0}},
  {{0, 0x0005e074, 0xffe1bde8, 0}}, {{0, 0x0005c761, 0xffe23f02, 0}},
  {{0, 0x0005ae4e, 0xffe2c01c, 0}}, {{0, 0x0005953b, 0xffe34136, 0}},
  {{0, 0x00057c28, 0xffe3c250, 0}}, {{0, 0x00056315, 0xffe4436a, 0}},
  {{0, 0x00054a02, 0xffe4c484, 0}}, {{0, 0x000530ef, 0xffe5459e, 0}},
  {{0, 0x000517dc, 0xffe5c6b8, 0}}, {{0, 0x0004fec9, 0xffe647d2, 0}},
  {{0, 0x0004e5b6, 0xffe6c8ec, 0}}, {{0, 0x0004cca3, 0xffe74a06, 0}},
  {{0, 0x0004b390, 0xffe7cb20, 0}}, {{0, 0x00049a7d, 0xffe84c3a, 0}},
  {{0, 0x0004816a, 0xffe8cd54, 0}}, {{0, 0x00046857, 0xffe94e6e, 0}},
  {{0, 0x00044f44, 0xffe9cf88, 0}}, {{0, 0x00043631, 0xffea50a2, 0}},
  {{0, 0x00041d1e, 0xffead1bc, 0}}, {{0, 0x0004040b, 0xffeb52d6, 0}},
  {{0, 0x0003eaf8, 0xffebd3f0, 0}}, {{0, 0x0003d1e5, 0xffec550a, 0}},
  {{0, 0x0003b8d2, 0xffecd624, 0}}, {{0, 0x00039fbf, 0xffed573e, 0}},
  {{0, 0x000386ac, 0xffedd858, 0}}, {{0, 0x00036d99, 0xffee5972, 0}},
  {{0, 0x00035486, 0xffeeda8c, 0}}, {{0, 0x00033b73, 0xffef5ba6, 0}},
  {{0, 0x00032260, 0xffefdcc0, 0}}, {{0, 0x0003094d, 0xfff05dda, 0}},
  {{0, 0x0002f03a, 0xfff0def4, 0}}, {{0, 0x0002d727, 0xfff1600e, 0}},
  {{0, 0x0002be14, 0xfff1e128, 0}}, {{0, 0x0002a501, 0xfff26242, 0}},
  {{0, 0x00028bee, 0xfff2e35c, 0}}, {{0, 0x000272db, 0xfff36476, 0}},
  {{0, 0x000259c8, 0xfff3e590, 0}}, {{0, 0x000240b5, 0xfff466aa, 0}},
  {{0, 0x000227a2, 0xfff4e7c4, 0}}, {{0, 0x00020e8f, 0xfff568de, 0}},
  {{0, 0x0001f57c, 0xfff5e9f8, 0}}, {{0, 0x0001dc69, 0xfff66b12, 0}},
  {{0, 0x0001c356, 0xfff6ec2c, 0}}, {{0, 0x0001aa43, 0xfff76d46, 0}},
  {{0, 0x00019130, 0xfff7ee60, 0}}, {{0, 0x0001781d, 0xfff86f7a, 0}},
  {{0, 0x00015f0a, 0xfff8f094, 0}}, {{0, 0x000145f7, 0xfff971ae, 0}},
  {{0, 0x00012ce4, 0xfff9f2c8, 0}}, {{0, 0x000113d1, 0xfffa73e2, 0}},
  {{0, 0x0000fabe, 0xfffaf4fc, 0}}, {{0, 0x0000e1ab, 0xfffb7616, 0}},
  {{0, 0x0000c898, 0xfffbf730, 0}}, {{0, 0x0000af85, 0xfffc784a, 0}},
  {{0, 0x00009672, 0xfffcf964, 0}}, {{0, 0x00007d5f, 0xfffd7a7e, 0}},
  {{0, 0x0000644c, 0xfffdfb98, 0}}, {{0, 0x00004b39, 0xfffe7cb2, 0}},
  {{0, 0x00003226, 0xfffefdcc, 0}}, {{0, 0x00001913, 0xffff7ee6, 0}},
  {{0, 0x00000000, 0x00000000, 0}}, {{0, 0xffffe6ed, 0x0000811a, 0}},
  {{0, 0xffffcdda, 0x00010234, 0}}, {{0, 0xffffb4c7, 0x0001834e, 0}},
  {{0, 0xffff9bb4, 0x00020468, 0}}, {{0, 0xffff82a1, 0x00028582, 0}},
  {{0, 0xffff698e, 0x0003069c, 0}}, {{0, 0xffff507b, 0x000387b6, 0}},
  {{0, 0xffff3768, 0x000408d0, 0}}, {{0, 0xffff1e55, 0x000489ea, 0}},
  {{0, 0xffff0542, 0x00050b04, 0}}, {{0, 0xfffeec2f, 0x00058c1e, 0}},
  {{0, 0xfffed31c, 0x00060d38, 0}}, {{0, 0xfffeba09, 0x00068e52, 0}},
  {{0, 0xfffea0f6, 0x00070f6c, 0}}, {{0, 0xfffe87e3, 0x00079086, 0}},
  {{0, 0xfffe6ed0, 0x000811a0, 0}}, {{0, 0xfffe55bd, 0x000892ba, 0}},
  {{0, 0xfffe3caa, 0x000913d4, 0}}, {{0, 0xfffe2397, 0x000994ee, 0}},
  {{0, 0xfffe0a84, 0x000a1608, 0}}, {{0, 0xfffdf171, 0x000a9722, 0}},
  {{0, 0xfffdd85e, 0x000b183c, 0}}, {{0, 0xfffdbf4b, 0x000b9956, 0}},
  {{0, 0xfffda638, 0x000c1a70, 0}}, {{0, 0xfffd8d25, 0x000c9b8a, 0}},
  {{0, 0xfffd7412, 0x000d1ca4, 0}}, {{0, 0xfffd5aff, 0x000d9dbe, 0}},
  {{0, 0xfffd41ec, 0x000e1ed8, 0}}, {{0, 0xfffd28d9, 0x000e9ff2, 0}},
  {{0, 0xfffd0fc6, 0x000f210c, 0}}, {{0, 0xfffcf6b3, 0x000fa226, 0}},
  {{0, 0xfffcdda0, 0x00102340, 0}}, {{0, 0xfffcc48d, 0x0010a45a, 0}},
  {{0, 0xfffcab7a, 0x00112574, 0}}, {{0, 0xfffc9267, 0x0011a68e, 0}},
  {{0, 0xfffc7954, 0x001227a8, 0}}, {{0, 0xfffc6041, 0x0012a8c2, 0}},
  {{0, 0xfffc472e, 0x001329dc, 0}}, {{0, 0xfffc2e1b, 0x0013aaf6, 0}},
  {{0, 0xfffc1508, 0x00142c10, 0}}, {{0, 0xfffbfbf5, 0x0014ad2a, 0}},
  {{0, 0xfffbe2e2, 0x00152e44, 0}}, {{0, 0xfffbc9cf, 0x0015af5e, 0}},
  {{0, 0xfffbb0bc, 0x00163078, 0}}, {{0, 0xfffb97a9, 0x0016b192, 0}},
  {{0, 0xfffb7e96, 0x001732ac, 0}}, {{0, 0xfffb6583, 0x0017b3c6, 0}},
  {{0, 0xfffb4c70, 0x001834e0, 0}}, {{0, 0xfffb335d, 0x0018b5fa, 0}},
  {{0, 0xfffb1a4a, 0x00193714, 0}}, {{0, 0xfffb0137, 0x0019b82e, 0}},
  {{0, 0xfffae824, 0x001a3948, 0}}, {{0, 0xfffacf11, 0x001aba62, 0}},
  {{0, 0xfffab5fe, 0x001b3b7c, 0}}, {{0, 0xfffa9ceb, 0x001bbc96, 0}},
  {{0, 0xfffa83d8, 0x001c3db0, 0}}, {{0, 0xfffa6ac5, 0x001cbeca, 0}},
  {{0, 0xfffa51b2, 0x001d3fe4, 0}}, {{0, 0xfffa389f, 0x001dc0fe, 0}},
  {{0, 0xfffa1f8c, 0x001e4218, 0}}, {{0, 0xfffa0679, 0x001ec332, 0}},
  {{0, 0xfff9ed66, 0x001f444c, 0}}, {{0, 0xfff9d453, 0x001fc566, 0}},
  {{0, 0xfff9bb40, 0x00204680, 0}}, {{0, 0xfff9a22d, 0x0020c79a, 0}},
  {{0, 0xfff9891a, 0x002148b4, 0}}, {{0, 0xfff97007, 0x0021c9ce, 0}},
  {{0, 0xfff956f4, 0x00224ae8, 0}}, {{0, 0xfff93de1, 0x0022cc02, 0}},
  {{0, 0xfff924ce, 0x00234d1c, 0}}, {{0, 0xfff90bbb, 0x0023ce36, 0}},
  {{0, 0xfff8f2a8, 0x00244f50, 0}}, {{0, 0xfff8d995, 0x0024d06a, 0}},
  {{0, 0xfff8c082, 0x00255184, 0}}, {{0, 0xfff8a76f, 0x0025d29e, 0}},
  {{0, 0xfff88e5c, 0x002653b8, 0}}, {{0, 0xfff87549, 0x0026d4d2, 0}},
  {{0, 0xfff85c36, 0x002755ec, 0}}, {{0, 0xfff84323, 0x0027d706, 0}},
  {{0, 0xfff82a10, 0x00285820, 0}}, {{0, 0xfff810fd, 0x0028d93a, 0}},
  {{0, 0xfff7f7ea, 0x00295a54, 0}}, {{0, 0xfff7ded7, 0x0029db6e, 0}},
  {{0, 0xfff7c5c4, 0x002a5c88, 0}}, {{0, 0xfff7acb1, 0x002adda2, 0}},
  {{0, 0xfff7939e, 0x002b5ebc, 0}}, {{0, 0xfff77a8b, 0x002bdfd6, 0}},
  {{0, 0xfff76178, 0x002c60f0, 0}}, {{0, 0xfff74865, 0x002ce20a, 0}},
  {{0, 0xfff72f52, 0x002d6324, 0}}, {{0, 0xfff7163f, 0x002de43e, 0}},
  {{0, 0xfff6fd2c, 0x002e6558, 0}}, {{0, 0xfff6e419, 0x002ee672, 0}},
  {{0, 0xfff6cb06, 0x002f678c, 0}}, {{0, 0xfff6b1f3, 0x002fe8a6, 0}},
  {{0, 0xfff698e0, 0x003069c0, 0}}, {{0, 0xfff67fcd, 0x0030eada, 0}},
  {{0, 0xfff666ba, 0x00316bf4, 0}}, {{0, 0xfff64da7, 0x0031ed0e, 0}},
  {{0, 0xfff63494, 0x00326e28, 0}}, {{0, 0xfff61b81, 0x0032ef42, 0}},
  {{0, 0xfff6026e, 0x0033705c, 0}}, {{0, 0xfff5e95b, 0x0033f176, 0}},
  {{0, 0xfff5d048, 0x00347290, 0}}, {{0, 0xfff5b735, 0x0034f3aa, 0}},
  {{0, 0xfff59e22, 0x003574c4, 0}}, {{0, 0xfff5850f, 0x0035f5de, 0}},
  {{0, 0xfff56bfc, 0x003676f8, 0}}, {{0, 0xfff552e9, 0x0036f812, 0}},
  {{0, 0xfff539d6, 0x0037792c, 0}}, {{0, 0xfff520c3, 0x0037fa46, 0}},
  {{0, 0xfff507b0, 0x00387b60, 0}}, {{0, 0xfff4ee9d, 0x0038fc7a, 0}},
  {{0, 0xfff4d58a, 0x00397d94, 0}}, {{0, 0xfff4bc77, 0x0039feae, 0}},
  {{0, 0xfff4a364, 0x003a7fc8, 0}}, {{0, 0xfff48a51, 0x003b00e2, 0}},
  {{0, 0xfff4713e, 0x003b81fc, 0}}, {{0, 0xfff4582b, 0x003c0316, 0}},
  {{0, 0xfff43f18, 0x003c8430, 0}}, {{0, 0xfff42605, 0x003d054a, 0}},
  {{0, 0xfff40cf2, 0x003d8664, 0}}, {{0, 0xfff3f3df, 0x003e077e, 0}},
  {{0, 0xfff3dacc, 0x003e8898, 0}}, {{0, 0xfff3c1b9, 0x003f09b2, 0}},
  {{0, 0xfff3a8a6, 0x003f8acc, 0}}, {{0, 0xfff38f93, 0x00400be6, 0}}
 };
 // Read-only lookup table of 256 VP8kCstSSE2 entries, presumably indexed by the
 // 8-bit V (chroma) sample value. Every entry has zero in lanes 2 and 3; only
 // lanes 0 and 1 carry data, and both are zero at index 128.
 // Declared 'const' like the sibling VP8kUtoRGBA table: the values are
 // precomputed and must never be written at run time.
 // NOTE(review): the meaning of each lane and the fixed-point scale are set by
 // the code that consumes this table, which is not visible here -- confirm.
 static const VP8kCstSSE2 VP8kVtoRGBA[256] = {
  {{0xffcced80, 0x001a0400, 0, 0}}, {{0xffcd53a5, 0x0019cff8, 0, 0}},
  {{0xffcdb9ca, 0x00199bf0, 0, 0}}, {{0xffce1fef, 0x001967e8, 0, 0}},
  {{0xffce8614, 0x001933e0, 0, 0}}, {{0xffceec39, 0x0018ffd8, 0, 0}},
  {{0xffcf525e, 0x0018cbd0, 0, 0}}, {{0xffcfb883, 0x001897c8, 0, 0}},
  {{0xffd01ea8, 0x001863c0, 0, 0}}, {{0xffd084cd, 0x00182fb8, 0, 0}},
  {{0xffd0eaf2, 0x0017fbb0, 0, 0}}, {{0xffd15117, 0x0017c7a8, 0, 0}},
  {{0xffd1b73c, 0x001793a0, 0, 0}}, {{0xffd21d61, 0x00175f98, 0, 0}},
  {{0xffd28386, 0x00172b90, 0, 0}}, {{0xffd2e9ab, 0x0016f788, 0, 0}},
  {{0xffd34fd0, 0x0016c380, 0, 0}}, {{0xffd3b5f5, 0x00168f78, 0, 0}},
  {{0xffd41c1a, 0x00165b70, 0, 0}}, {{0xffd4823f, 0x00162768, 0, 0}},
  {{0xffd4e864, 0x0015f360, 0, 0}}, {{0xffd54e89, 0x0015bf58, 0, 0}},
  {{0xffd5b4ae, 0x00158b50, 0, 0}}, {{0xffd61ad3, 0x00155748, 0, 0}},
  {{0xffd680f8, 0x00152340, 0, 0}}, {{0xffd6e71d, 0x0014ef38, 0, 0}},
  {{0xffd74d42, 0x0014bb30, 0, 0}}, {{0xffd7b367, 0x00148728, 0, 0}},
  {{0xffd8198c, 0x00145320, 0, 0}}, {{0xffd87fb1, 0x00141f18, 0, 0}},
  {{0xffd8e5d6, 0x0013eb10, 0, 0}}, {{0xffd94bfb, 0x0013b708, 0, 0}},
  {{0xffd9b220, 0x00138300, 0, 0}}, {{0xffda1845, 0x00134ef8, 0, 0}},
  {{0xffda7e6a, 0x00131af0, 0, 0}}, {{0xffdae48f, 0x0012e6e8, 0, 0}},
  {{0xffdb4ab4, 0x0012b2e0, 0, 0}}, {{0xffdbb0d9, 0x00127ed8, 0, 0}},
  {{0xffdc16fe, 0x00124ad0, 0, 0}}, {{0xffdc7d23, 0x001216c8, 0, 0}},
  {{0xffdce348, 0x0011e2c0, 0, 0}}, {{0xffdd496d, 0x0011aeb8, 0, 0}},
  {{0xffddaf92, 0x00117ab0, 0, 0}}, {{0xffde15b7, 0x001146a8, 0, 0}},
  {{0xffde7bdc, 0x001112a0, 0, 0}}, {{0xffdee201, 0x0010de98, 0, 0}},
  {{0xffdf4826, 0x0010aa90, 0, 0}}, {{0xffdfae4b, 0x00107688, 0, 0}},
  {{0xffe01470, 0x00104280, 0, 0}}, {{0xffe07a95, 0x00100e78, 0, 0}},
  {{0xffe0e0ba, 0x000fda70, 0, 0}}, {{0xffe146df, 0x000fa668, 0, 0}},
  {{0xffe1ad04, 0x000f7260, 0, 0}}, {{0xffe21329, 0x000f3e58, 0, 0}},
  {{0xffe2794e, 0x000f0a50, 0, 0}}, {{0xffe2df73, 0x000ed648, 0, 0}},
  {{0xffe34598, 0x000ea240, 0, 0}}, {{0xffe3abbd, 0x000e6e38, 0, 0}},
  {{0xffe411e2, 0x000e3a30, 0, 0}}, {{0xffe47807, 0x000e0628, 0, 0}},
  {{0xffe4de2c, 0x000dd220, 0, 0}}, {{0xffe54451, 0x000d9e18, 0, 0}},
  {{0xffe5aa76, 0x000d6a10, 0, 0}}, {{0xffe6109b, 0x000d3608, 0, 0}},
  {{0xffe676c0, 0x000d0200, 0, 0}}, {{0xffe6dce5, 0x000ccdf8, 0, 0}},
  {{0xffe7430a, 0x000c99f0, 0, 0}}, {{0xffe7a92f, 0x000c65e8, 0, 0}},
  {{0xffe80f54, 0x000c31e0, 0, 0}}, {{0xffe87579, 0x000bfdd8, 0, 0}},
  {{0xffe8db9e, 0x000bc9d0, 0, 0}}, {{0xffe941c3, 0x000b95c8, 0, 0}},
  {{0xffe9a7e8, 0x000b61c0, 0, 0}}, {{0xffea0e0d, 0x000b2db8, 0, 0}},
  {{0xffea7432, 0x000af9b0, 0, 0}}, {{0xffeada57, 0x000ac5a8, 0, 0}},
  {{0xffeb407c, 0x000a91a0, 0, 0}}, {{0xffeba6a1, 0x000a5d98, 0, 0}},
  {{0xffec0cc6, 0x000a2990, 0, 0}}, {{0xffec72eb, 0x0009f588, 0, 0}},
  {{0xffecd910, 0x0009c180, 0, 0}}, {{0xffed3f35, 0x00098d78, 0, 0}},
  {{0xffeda55a, 0x00095970, 0, 0}}, {{0xffee0b7f, 0x00092568, 0, 0}},
  {{0xffee71a4, 0x0008f160, 0, 0}}, {{0xffeed7c9, 0x0008bd58, 0, 0}},
  {{0xffef3dee, 0x00088950, 0, 0}}, {{0xffefa413, 0x00085548, 0, 0}},
  {{0xfff00a38, 0x00082140, 0, 0}}, {{0xfff0705d, 0x0007ed38, 0, 0}},
  {{0xfff0d682, 0x0007b930, 0, 0}}, {{0xfff13ca7, 0x00078528, 0, 0}},
  {{0xfff1a2cc, 0x00075120, 0, 0}}, {{0xfff208f1, 0x00071d18, 0, 0}},
  {{0xfff26f16, 0x0006e910, 0, 0}}, {{0xfff2d53b, 0x0006b508, 0, 0}},
  {{0xfff33b60, 0x00068100, 0, 0}}, {{0xfff3a185, 0x00064cf8, 0, 0}},
  {{0xfff407aa, 0x000618f0, 0, 0}}, {{0xfff46dcf, 0x0005e4e8, 0, 0}},
  {{0xfff4d3f4, 0x0005b0e0, 0, 0}}, {{0xfff53a19, 0x00057cd8, 0, 0}},
  {{0xfff5a03e, 0x000548d0, 0, 0}}, {{0xfff60663, 0x000514c8, 0, 0}},
  {{0xfff66c88, 0x0004e0c0, 0, 0}}, {{0xfff6d2ad, 0x0004acb8, 0, 0}},
  {{0xfff738d2, 0x000478b0, 0, 0}}, {{0xfff79ef7, 0x000444a8, 0, 0}},
  {{0xfff8051c, 0x000410a0, 0, 0}}, {{0xfff86b41, 0x0003dc98, 0, 0}},
  {{0xfff8d166, 0x0003a890, 0, 0}}, {{0xfff9378b, 0x00037488, 0, 0}},
  {{0xfff99db0, 0x00034080, 0, 0}}, {{0xfffa03d5, 0x00030c78, 0, 0}},
  {{0xfffa69fa, 0x0002d870, 0, 0}}, {{0xfffad01f, 0x0002a468, 0, 0}},
  {{0xfffb3644, 0x00027060, 0, 0}}, {{0xfffb9c69, 0x00023c58, 0, 0}},
  {{0xfffc028e, 0x00020850, 0, 0}}, {{0xfffc68b3, 0x0001d448, 0, 0}},
  {{0xfffcced8, 0x0001a040, 0, 0}}, {{0xfffd34fd, 0x00016c38, 0, 0}},
  {{0xfffd9b22, 0x00013830, 0, 0}}, {{0xfffe0147, 0x00010428, 0, 0}},
  {{0xfffe676c, 0x0000d020, 0, 0}}, {{0xfffecd91, 0x00009c18, 0, 0}},
  {{0xffff33b6, 0x00006810, 0, 0}}, {{0xffff99db, 0x00003408, 0, 0}},
  {{0x00000000, 0x00000000, 0, 0}}, {{0x00006625, 0xffffcbf8, 0, 0}},
  {{0x0000cc4a, 0xffff97f0, 0, 0}}, {{0x0001326f, 0xffff63e8, 0, 0}},
  {{0x00019894, 0xffff2fe0, 0, 0}}, {{0x0001feb9, 0xfffefbd8, 0, 0}},
  {{0x000264de, 0xfffec7d0, 0, 0}}, {{0x0002cb03, 0xfffe93c8, 0, 0}},
  {{0x00033128, 0xfffe5fc0, 0, 0}}, {{0x0003974d, 0xfffe2bb8, 0, 0}},
  {{0x0003fd72, 0xfffdf7b0, 0, 0}}, {{0x00046397, 0xfffdc3a8, 0, 0}},
  {{0x0004c9bc, 0xfffd8fa0, 0, 0}}, {{0x00052fe1, 0xfffd5b98, 0, 0}},
  {{0x00059606, 0xfffd2790, 0, 0}}, {{0x0005fc2b, 0xfffcf388, 0, 0}},
  {{0x00066250, 0xfffcbf80, 0, 0}}, {{0x0006c875, 0xfffc8b78, 0, 0}},
  {{0x00072e9a, 0xfffc5770, 0, 0}}, {{0x000794bf, 0xfffc2368, 0, 0}},
  {{0x0007fae4, 0xfffbef60, 0, 0}}, {{0x00086109, 0xfffbbb58, 0, 0}},
  {{0x0008c72e, 0xfffb8750, 0, 0}}, {{0x00092d53, 0xfffb5348, 0, 0}},
  {{0x00099378, 0xfffb1f40, 0, 0}}, {{0x0009f99d, 0xfffaeb38, 0, 0}},
  {{0x000a5fc2, 0xfffab730, 0, 0}}, {{0x000ac5e7, 0xfffa8328, 0, 0}},
  {{0x000b2c0c, 0xfffa4f20, 0, 0}}, {{0x000b9231, 0xfffa1b18, 0, 0}},
  {{0x000bf856, 0xfff9e710, 0, 0}}, {{0x000c5e7b, 0xfff9b308, 0, 0}},
  {{0x000cc4a0, 0xfff97f00, 0, 0}}, {{0x000d2ac5, 0xfff94af8, 0, 0}},
  {{0x000d90ea, 0xfff916f0, 0, 0}}, {{0x000df70f, 0xfff8e2e8, 0, 0}},
  {{0x000e5d34, 0xfff8aee0, 0, 0}}, {{0x000ec359, 0xfff87ad8, 0, 0}},
  {{0x000f297e, 0xfff846d0, 0, 0}}, {{0x000f8fa3, 0xfff812c8, 0, 0}},
  {{0x000ff5c8, 0xfff7dec0, 0, 0}}, {{0x00105bed, 0xfff7aab8, 0, 0}},
  {{0x0010c212, 0xfff776b0, 0, 0}}, {{0x00112837, 0xfff742a8, 0, 0}},
  {{0x00118e5c, 0xfff70ea0, 0, 0}}, {{0x0011f481, 0xfff6da98, 0, 0}},
  {{0x00125aa6, 0xfff6a690, 0, 0}}, {{0x0012c0cb, 0xfff67288, 0, 0}},
  {{0x001326f0, 0xfff63e80, 0, 0}}, {{0x00138d15, 0xfff60a78, 0, 0}},
  {{0x0013f33a, 0xfff5d670, 0, 0}}, {{0x0014595f, 0xfff5a268, 0, 0}},
  {{0x0014bf84, 0xfff56e60, 0, 0}}, {{0x001525a9, 0xfff53a58, 0, 0}},
  {{0x00158bce, 0xfff50650, 0, 0}}, {{0x0015f1f3, 0xfff4d248, 0, 0}},
  {{0x00165818, 0xfff49e40, 0, 0}}, {{0x0016be3d, 0xfff46a38, 0, 0}},
  {{0x00172462, 0xfff43630, 0, 0}}, {{0x00178a87, 0xfff40228, 0, 0}},
  {{0x0017f0ac, 0xfff3ce20, 0, 0}}, {{0x001856d1, 0xfff39a18, 0, 0}},
  {{0x0018bcf6, 0xfff36610, 0, 0}}, {{0x0019231b, 0xfff33208, 0, 0}},
  {{0x00198940, 0xfff2fe00, 0, 0}}, {{0x0019ef65, 0xfff2c9f8, 0, 0}},
  {{0x001a558a, 0xfff295f0, 0, 0}}, {{0x001abbaf, 0xfff261e8, 0, 0}},
  {{0x001b21d4, 0xfff22de0, 0, 0}}, {{0x001b87f9, 0xfff1f9d8, 0, 0}},
  {{0x001bee1e, 0xfff1c5d0, 0, 0}}, {{0x001c5443, 0xfff191c8, 0, 0}},
  {{0x001cba68, 0xfff15dc0, 0, 0}}, {{0x001d208d, 0xfff129b8, 0, 0}},
  {{0x001d86b2, 0xfff0f5b0, 0, 0}}, {{0x001decd7, 0xfff0c1a8, 0, 0}},
  {{0x001e52fc, 0xfff08da0, 0, 0}}, {{0x001eb921, 0xfff05998, 0, 0}},
  {{0x001f1f46, 0xfff02590, 0, 0}}, {{0x001f856b, 0xffeff188, 0, 0}},
  {{0x001feb90, 0xffefbd80, 0, 0}}, {{0x002051b5, 0xffef8978, 0, 0}},
  {{0x0020b7da, 0xffef5570, 0, 0}}, {{0x00211dff, 0xffef2168, 0, 0}},
  {{0x00218424, 0xffeeed60, 0, 0}}, {{0x0021ea49, 0xffeeb958, 0, 0}},
  {{0x0022506e, 0xffee8550, 0, 0}}, {{0x0022b693, 0xffee5148, 0, 0}},
  {{0x00231cb8, 0xffee1d40, 0, 0}}, {{0x002382dd, 0xffede938, 0, 0}},
  {{0x0023e902, 0xffedb530, 0, 0}}, {{0x00244f27, 0xffed8128, 0, 0}},
  {{0x0024b54c, 0xffed4d20, 0, 0}}, {{0x00251b71, 0xffed1918, 0, 0}},
  {{0x00258196, 0xffece510, 0, 0}}, {{0x0025e7bb, 0xffecb108, 0, 0}},
  {{0x00264de0, 0xffec7d00, 0, 0}}, {{0x0026b405, 0xffec48f8, 0, 0}},
  {{0x00271a2a, 0xffec14f0, 0, 0}}, {{0x0027804f, 0xffebe0e8, 0, 0}},
  {{0x0027e674, 0xffebace0, 0, 0}}, {{0x00284c99, 0xffeb78d8, 0, 0}},
  {{0x0028b2be, 0xffeb44d0, 0, 0}}, {{0x002918e3, 0xffeb10c8, 0, 0}},
  {{0x00297f08, 0xffeadcc0, 0, 0}}, {{0x0029e52d, 0xffeaa8b8, 0, 0}},
  {{0x002a4b52, 0xffea74b0, 0, 0}}, {{0x002ab177, 0xffea40a8, 0, 0}},
  {{0x002b179c, 0xffea0ca0, 0, 0}}, {{0x002b7dc1, 0xffe9d898, 0, 0}},
  {{0x002be3e6, 0xffe9a490, 0, 0}}, {{0x002c4a0b, 0xffe97088, 0, 0}},
  {{0x002cb030, 0xffe93c80, 0, 0}}, {{0x002d1655, 0xffe90878, 0, 0}},
  {{0x002d7c7a, 0xffe8d470, 0, 0}}, {{0x002de29f, 0xffe8a068, 0, 0}},
  {{0x002e48c4, 0xffe86c60, 0, 0}}, {{0x002eaee9, 0xffe83858, 0, 0}},
  {{0x002f150e, 0xffe80450, 0, 0}}, {{0x002f7b33, 0xffe7d048, 0, 0}},
  {{0x002fe158, 0xffe79c40, 0, 0}}, {{0x0030477d, 0xffe76838, 0, 0}},
  {{0x0030ada2, 0xffe73430, 0, 0}}, {{0x003113c7, 0xffe70028, 0, 0}},
  {{0x003179ec, 0xffe6cc20, 0, 0}}, {{0x0031e011, 0xffe69818, 0, 0}},
  {{0x00324636, 0xffe66410, 0, 0}}, {{0x0032ac5b, 0xffe63008, 0, 0}}
 };
--- a/src/enc/Makefile.am
+++ b/src/enc/Makefile.am
@ -1,4 +1,3 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src
 noinst_LTLIBRARIES = libwebpencode.la
 libwebpencode_la_SOURCES =
@ -12,8 +11,11 @@ libwebpencode_la_SOURCES += filter.c
 libwebpencode_la_SOURCES += frame.c
 libwebpencode_la_SOURCES += histogram.c
 libwebpencode_la_SOURCES += iterator.c
 libwebpencode_la_SOURCES += layer.c
 libwebpencode_la_SOURCES += picture.c
 libwebpencode_la_SOURCES += picture_csp.c
 libwebpencode_la_SOURCES += picture_psnr.c
 libwebpencode_la_SOURCES += picture_rescale.c
 libwebpencode_la_SOURCES += picture_tools.c
 libwebpencode_la_SOURCES += quant.c
 libwebpencode_la_SOURCES += syntax.c
 libwebpencode_la_SOURCES += token.c
--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane compression.
@ -15,12 +17,9 @@
 #include "./vp8enci.h"
 #include "../utils/filters.h"
 #include "../utils/quant_levels.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 // -----------------------------------------------------------------------------
 // Encodes the given alpha data via specified compression method 'method'.
 // The pre-processing (quantization) is performed if 'quality' is less than 100.
@ -36,7 +35,7 @@ extern "C" {
 //
 // 'output' corresponds to the buffer containing compressed alpha data.
 //          This buffer is allocated by this method and caller should call
-//          free(*output) when done.
+//          WebPSafeFree(*output) when done.
 // 'output_size' corresponds to size of this compressed alpha buffer.
 //
 // Returns 1 on successfully encoding the alpha and
@ -48,12 +47,11 @@ extern "C" {
 static int EncodeLossless(const uint8_t* const data, int width, int height,
                          int effort_level,  // in [0..6] range
-                          VP8BitWriter* const bw,
+                          VP8LBitWriter* const bw,
                          WebPAuxStats* const stats) {
  int ok = 0;
  WebPConfig config;
  WebPPicture picture;
  VP8LBitWriter tmp_bw;
  WebPPictureInit(&picture);
  picture.width = width;
@ -69,7 +67,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
    const uint8_t* src = data;
    for (j = 0; j < picture.height; ++j) {
      for (i = 0; i < picture.width; ++i) {
-        dst[i] = (src[i] << 8) | 0xff000000u;
+        dst[i] = src[i] << 8;  // we leave A/R/B channels zero'd.
      }
      src += width;
      dst += picture.argb_stride;
@ -79,36 +77,46 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
  WebPConfigInit(&config);
  config.lossless = 1;
  config.method = effort_level;  // impact is very small
-  // Set a moderate default quality setting for alpha.
+  // Set a low default quality for encoding alpha. Ensure that Alpha quality at
-  config.quality = 5.f * effort_level;
+  // lower methods (3 and below) is less than the threshold for triggering
  // costly 'BackwardReferencesTraceBackwards'.
  config.quality = 8.f * effort_level;
  assert(config.quality >= 0 && config.quality <= 100.f);
-  ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
+  ok = (VP8LEncodeStream(&config, &picture, bw) == VP8_ENC_OK);
  ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
  WebPPictureFree(&picture);
-  if (ok) {
+  ok = ok && !bw->error_;
-    const uint8_t* const buffer = VP8LBitWriterFinish(&tmp_bw);
+  if (!ok) {
-    const size_t buffer_size = VP8LBitWriterNumBytes(&tmp_bw);
+    VP8LBitWriterDestroy(bw);
-    VP8BitWriterAppend(bw, buffer, buffer_size);
+    return 0;
  }
-  VP8LBitWriterDestroy(&tmp_bw);
+  return 1;
-  return ok && !bw->error_;
+
 }
 // -----------------------------------------------------------------------------
 // Small struct to hold the result of a filter mode compression attempt.
 // Trials are compared on 'score'; the one with the lower score is kept.
 typedef struct {
  size_t score;  // VP8BitWriterSize() of 'bw' once the trial has been encoded
  VP8BitWriter bw;  // receives the alpha header byte followed by the payload
  WebPAuxStats stats;  // handed to the lossless encoder for this trial
 } FilterTrial;
 // This function always returns an initialized 'bw' object, even upon error.
 static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
                               int method, int filter, int reduce_levels,
                               int effort_level,  // in [0..6] range
                               uint8_t* const tmp_alpha,
-                               VP8BitWriter* const bw,
+                               FilterTrial* result) {
                               WebPAuxStats* const stats) {
  int ok = 0;
  const uint8_t* alpha_src;
  WebPFilterFunc filter_func;
  uint8_t header;
  size_t expected_size;
  const size_t data_size = width * height;
  const uint8_t* output = NULL;
  size_t output_size = 0;
  VP8LBitWriter tmp_bw;
  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
  assert(filter >= 0 && filter < WEBP_FILTER_LAST);
@ -117,15 +125,6 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
  assert(sizeof(header) == ALPHA_HEADER_LEN);
  // TODO(skal): have a common function and #define's to validate alpha params.
  expected_size =
      (method == ALPHA_NO_COMPRESSION) ? (ALPHA_HEADER_LEN + data_size)
                                       : (data_size >> 5);
  header = method | (filter << 2);
  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
  VP8BitWriterInit(bw, expected_size);
  VP8BitWriterAppend(bw, &header, ALPHA_HEADER_LEN);
  filter_func = WebPFilters[filter];
  if (filter_func != NULL) {
    filter_func(data, width, height, width, tmp_alpha);
@ -134,13 +133,43 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
    alpha_src = data;
  }
-  if (method == ALPHA_NO_COMPRESSION) {
+  if (method != ALPHA_NO_COMPRESSION) {
-    ok = VP8BitWriterAppend(bw, alpha_src, width * height);
+    ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
-    ok = ok && !bw->error_;
+    ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
-  } else {
+                              &tmp_bw, &result->stats);
-    ok = EncodeLossless(alpha_src, width, height, effort_level, bw, stats);
+    if (ok) {
-    VP8BitWriterFinish(bw);
+      output = VP8LBitWriterFinish(&tmp_bw);
      output_size = VP8LBitWriterNumBytes(&tmp_bw);
      if (output_size > data_size) {
        // compressed size is larger than source! Revert to uncompressed mode.
        method = ALPHA_NO_COMPRESSION;
        VP8LBitWriterDestroy(&tmp_bw);
      }
    } else {
      VP8LBitWriterDestroy(&tmp_bw);
      return 0;
    }
  }
  if (method == ALPHA_NO_COMPRESSION) {
    output = alpha_src;
    output_size = data_size;
    ok = 1;
  }
  // Emit final result.
  header = method | (filter << 2);
  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
  VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
  ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
  ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
  if (method != ALPHA_NO_COMPRESSION) {
    VP8LBitWriterDestroy(&tmp_bw);
  }
  ok = ok && !result->bw.error_;
  result->score = VP8BitWriterSize(&result->bw);
  return ok;
 }
@ -156,6 +185,104 @@ static void CopyPlane(const uint8_t* src, int src_stride,
  }
 }
 // Returns the number of distinct byte values found in the 'width' x 'height'
 // plane that starts at 'data' and whose rows are 'stride' bytes apart.
 // The result is in the [0..256] range.
 static int GetNumColors(const uint8_t* data, int width, int height,
                         int stride) {
  uint8_t seen[256] = { 0 };
  int num_distinct = 0;
  int x, y, v;
  // Pass 1: flag every value that shows up somewhere in the plane.
  for (y = 0; y < height; ++y) {
    const uint8_t* const row = data + y * stride;
    for (x = 0; x < width; ++x) seen[row[x]] = 1;
  }
  // Pass 2: each flag is 0 or 1, so their sum is the distinct-value count.
  for (v = 0; v < 256; ++v) num_distinct += seen[v];
  return num_distinct;
 }
 // Bit-set containing only the WEBP_FILTER_NONE bit.
 #define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
 // Bit-set with one bit set for every filter index below WEBP_FILTER_LAST.
 #define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
 // Maps the requested 'filter' mode to the set of filters worth trying, as a
 // bit-set in which bit 'n' stands for filter index 'n'.
 //  - WEBP_FILTER_FAST: one estimated candidate, plus WEBP_FILTER_NONE when
 //    'effort_level' is above 3 or the plane uses more than 192 distinct values.
 //  - WEBP_FILTER_NONE: only WEBP_FILTER_NONE.
 //  - anything else (WEBP_FILTER_BEST): every filter.
 static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
                              int filter, int effort_level) {
  if (filter != WEBP_FILTER_FAST) {
    // Explicit request: either NONE alone, or the exhaustive search.
    return (filter == WEBP_FILTER_NONE) ? FILTER_TRY_NONE : FILTER_TRY_ALL;
  } else {
    const int kMinColorsForFilterNone = 16;
    const int kMaxColorsForFilterNone = 192;
    const int num_colors = GetNumColors(alpha, width, height, width);
    // With few distinct values NONE compresses better, so skip the estimate.
    const int best_filter =
        (num_colors <= kMinColorsForFilterNone)
            ? WEBP_FILTER_NONE
            : EstimateBestFilter(alpha, width, height, width);
    uint32_t filters_to_try = 1 << best_filter;
    // At higher effort, or with many distinct values, also try NONE next to
    // the estimated candidate.
    if (effort_level > 3 || num_colors > kMaxColorsForFilterNone) {
      filters_to_try |= FILTER_TRY_NONE;
    }
    return filters_to_try;
  }
 }
 // Puts 'score' in the "nothing encoded yet" state: the score is set to the
 // largest size_t value so that any real trial compares as smaller, and the
 // bit writer is initialized with an expected size of 0.
 static void InitFilterTrial(FilterTrial* const score) {
  // '~0U' is only as wide as unsigned int; complementing a size_t zero gives
  // the true maximum on platforms where size_t is wider than unsigned int.
  score->score = ~(size_t)0;
  VP8BitWriterInit(&score->bw, 0);
 }
 // Encodes the 'width' x 'height' alpha plane once per candidate prediction
 // filter and hands the smallest result back to the caller.
 //
 // 'filter' selects the candidates through GetFilterMap(). 'data_size' is the
 // size of the scratch buffer allocated for the filtered samples. 'method',
 // 'reduce_levels' and 'effort_level' are forwarded to EncodeAlphaInternal().
 //
 // Returns non-zero on success; '*output' and '*output_size' then describe the
 // winning bit-writer buffer and, if 'stats' is not NULL, '*stats' receives the
 // winning trial's stats. Returns 0 on failure, leaving the outputs untouched.
 // NOTE(review): the caller presumably takes ownership of '*output' -- confirm.
 static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
                                  size_t data_size, int method, int filter,
                                  int reduce_levels, int effort_level,
                                  uint8_t** const output,
                                  size_t* const output_size,
                                  WebPAuxStats* const stats) {
  int ok = 1;
  FilterTrial best;
  uint32_t try_map =
      GetFilterMap(alpha, width, height, filter, effort_level);
  InitFilterTrial(&best);
  if (try_map != FILTER_TRY_NONE) {
    uint8_t* filtered_alpha =  (uint8_t*)WebPSafeMalloc(1ULL, data_size);
    if (filtered_alpha == NULL) return 0;
    // Bit 'n' of try_map means "try filter n": walk the bits from the LSB up
    // and stop at the first failure or once no candidate is left.
    for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
      if (try_map & 1) {
        FilterTrial trial;
        // EncodeAlphaInternal() can fail before it initializes 'trial.bw'.
        // Start from an empty writer so the wipe-out below never operates on
        // indeterminate memory.
        InitFilterTrial(&trial);
        ok = EncodeAlphaInternal(alpha, width, height, method, filter,
                                 reduce_levels, effort_level, filtered_alpha,
                                 &trial);
        if (ok && trial.score < best.score) {
          // New winner: release the previous best buffer and take over the
          // trial's buffer (struct copy transfers the writer state).
          VP8BitWriterWipeOut(&best.bw);
          best = trial;
        } else {
          VP8BitWriterWipeOut(&trial.bw);
        }
      }
    }
    WebPSafeFree(filtered_alpha);
  } else {
    // Only WEBP_FILTER_NONE is requested: no scratch buffer is needed.
    ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
                             reduce_levels, effort_level, NULL, &best);
  }
  if (ok) {
    // NOTE(review): 'best.stats' appears to be written only when a lossless
    // encode actually ran; confirm it is meaningful for uncompressed output.
    if (stats != NULL) *stats = best.stats;
    *output_size = VP8BitWriterSize(&best.bw);
    *output = VP8BitWriterBuf(&best.bw);
  } else {
    VP8BitWriterWipeOut(&best.bw);
  }
  return ok;
 }
 static int EncodeAlpha(VP8Encoder* const enc,
                       int quality, int method, int filter,
                       int effort_level,
@ -186,7 +313,12 @@ static int EncodeAlpha(VP8Encoder* const enc,
    return 0;
  }
-  quant_alpha = (uint8_t*)malloc(data_size);
+  if (method == ALPHA_NO_COMPRESSION) {
    // Don't filter, as filtering will make no impact on compressed size.
    filter = WEBP_FILTER_NONE;
  }
  quant_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
  if (quant_alpha == NULL) {
    return 0;
  }
@ -204,85 +336,19 @@ static int EncodeAlpha(VP8Encoder* const enc,
  }
  if (ok) {
-    VP8BitWriter bw;
+    ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
-    int test_filter;
+                               filter, reduce_levels, effort_level, output,
-    uint8_t* filtered_alpha = NULL;
+                               output_size, pic->stats);
    // We always test WEBP_FILTER_NONE first.
    ok = EncodeAlphaInternal(quant_alpha, width, height,
                             method, WEBP_FILTER_NONE, reduce_levels,
                             effort_level, NULL, &bw, pic->stats);
    if (!ok) {
      VP8BitWriterWipeOut(&bw);
      goto End;
    }
    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of a second candidate?
      filter = EstimateBestFilter(quant_alpha, width, height, width);
    }
    // Stop?
    if (filter == WEBP_FILTER_NONE) {
      goto Ok;
    }
    filtered_alpha = (uint8_t*)malloc(data_size);
    ok = (filtered_alpha != NULL);
    if (!ok) {
      goto End;
    }
    // Try the other mode(s).
    {
      WebPAuxStats best_stats;
      size_t best_score = VP8BitWriterSize(&bw);
      memset(&best_stats, 0, sizeof(best_stats));  // prevent spurious warning
      if (pic->stats != NULL) best_stats = *pic->stats;
      for (test_filter = WEBP_FILTER_HORIZONTAL;
           ok && (test_filter <= WEBP_FILTER_GRADIENT);
           ++test_filter) {
        VP8BitWriter tmp_bw;
        if (filter != WEBP_FILTER_BEST && test_filter != filter) {
          continue;
        }
        ok = EncodeAlphaInternal(quant_alpha, width, height,
                                 method, test_filter, reduce_levels,
                                 effort_level, filtered_alpha, &tmp_bw,
                                 pic->stats);
        if (ok) {
          const size_t score = VP8BitWriterSize(&tmp_bw);
          if (score < best_score) {
            // swap bitwriter objects.
            VP8BitWriter tmp = tmp_bw;
            tmp_bw = bw;
            bw = tmp;
            best_score = score;
            if (pic->stats != NULL) best_stats = *pic->stats;
          }
        } else {
          VP8BitWriterWipeOut(&bw);
        }
        VP8BitWriterWipeOut(&tmp_bw);
      }
      if (pic->stats != NULL) *pic->stats = best_stats;
    }
 Ok:
    if (ok) {
      *output_size = VP8BitWriterSize(&bw);
      *output = VP8BitWriterBuf(&bw);
    if (pic->stats != NULL) {  // need stats?
      pic->stats->coded_size += (int)(*output_size);
      enc->sse_[3] = sse;
    }
  }
-    free(filtered_alpha);
+
-  }
+  WebPSafeFree(quant_alpha);
 End:
  free(quant_alpha);
  return ok;
 }
 //------------------------------------------------------------------------------
 // Main calls
@ -300,7 +366,7 @@ static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
    return 0;
  }
  if (alpha_size != (uint32_t)alpha_size) {  // Sanity check.
-    free(alpha_data);
+    WebPSafeFree(alpha_data);
    return 0;
  }
  enc->alpha_data_size_ = (uint32_t)alpha_size;
@ -315,7 +381,7 @@ void VP8EncInitAlpha(VP8Encoder* const enc) {
  enc->alpha_data_size_ = 0;
  if (enc->thread_level_ > 0) {
    WebPWorker* const worker = &enc->alpha_worker_;
-    WebPWorkerInit(worker);
+    WebPGetWorkerInterface()->Init(worker);
    worker->data1 = enc;
    worker->data2 = NULL;
    worker->hook = (WebPWorkerHook)CompressAlphaJob;
@ -326,10 +392,11 @@ int VP8EncStartAlpha(VP8Encoder* const enc) {
  if (enc->has_alpha_) {
    if (enc->thread_level_ > 0) {
      WebPWorker* const worker = &enc->alpha_worker_;
-      if (!WebPWorkerReset(worker)) {    // Makes sure worker is good to go.
+      // Makes sure worker is good to go.
      if (!WebPGetWorkerInterface()->Reset(worker)) {
        return 0;
      }
-      WebPWorkerLaunch(worker);
+      WebPGetWorkerInterface()->Launch(worker);
      return 1;
    } else {
      return CompressAlphaJob(enc, NULL);   // just do the job right away
@ -342,7 +409,7 @@ int VP8EncFinishAlpha(VP8Encoder* const enc) {
  if (enc->has_alpha_) {
    if (enc->thread_level_ > 0) {
      WebPWorker* const worker = &enc->alpha_worker_;
-      if (!WebPWorkerSync(worker)) return 0;  // error
+      if (!WebPGetWorkerInterface()->Sync(worker)) return 0;  // error
    }
  }
  return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
@ -352,16 +419,15 @@ int VP8EncDeleteAlpha(VP8Encoder* const enc) {
  int ok = 1;
  if (enc->thread_level_ > 0) {
    WebPWorker* const worker = &enc->alpha_worker_;
-    ok = WebPWorkerSync(worker);  // finish anything left in flight
+    // finish anything left in flight
-    WebPWorkerEnd(worker);  // still need to end the worker, even if !ok
+    ok = WebPGetWorkerInterface()->Sync(worker);
    // still need to end the worker, even if !ok
    WebPGetWorkerInterface()->End(worker);
  }
-  free(enc->alpha_data_);
+  WebPSafeFree(enc->alpha_data_);
  enc->alpha_data_ = NULL;
  enc->alpha_data_size_ = 0;
  enc->has_alpha_ = 0;
  return ok;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
+// Use of this source code is governed by a BSD-style license
-//  Software License Agreement:  http://www.webmproject.org/license/software/
+// that can be found in the COPYING file in the root of the source
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Macroblock analysis
@ -17,10 +19,6 @@
 #include "./cost.h"
 #include "../utils/utils.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #define MAX_ITERS_K_MEANS  6
 //------------------------------------------------------------------------------
@ -32,7 +30,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
  const int w = enc->mb_w_;
  const int h = enc->mb_h_;
  const int majority_cnt_3_x_3_grid = 5;
-  uint8_t* const tmp = (uint8_t*)WebPSafeMalloc((uint64_t)w * h, sizeof(*tmp));
+  uint8_t* const tmp = (uint8_t*)WebPSafeMalloc(w * h, sizeof(*tmp));
  assert((uint64_t)(w * h) == (uint64_t)w * h);   // no overflow, as per spec
  if (tmp == NULL) return;
@ -53,6 +51,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
      for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
        if (cnt[n] >= majority_cnt_3_x_3_grid) {
          majority_seg = n;
          break;
        }
      }
      tmp[x + y * w] = majority_seg;
@ -64,7 +63,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
      mb->segment_ = tmp[x + y * w];
    }
  }
-  free(tmp);
+  WebPSafeFree(tmp);
 }
 //------------------------------------------------------------------------------
@ -142,7 +141,11 @@ static void MergeHistograms(const VP8Histogram* const in,
 static void AssignSegments(VP8Encoder* const enc,
                           const int alphas[MAX_ALPHA + 1]) {
-  const int nb = enc->segment_hdr_.num_segments_;
+  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid spurious warning about 'n + 1' exceeding
  // array bounds of 'centers' with some compilers (noticed with gcc-4.9).
  const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
                 enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
  int centers[NUM_MB_SEGMENTS];
  int weighted_average = 0;
  int map[MAX_ALPHA + 1];
@ -151,6 +154,9 @@ static void AssignSegments(VP8Encoder* const enc,
  // 'int' type is ok for histo, and won't overflow
  int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
  assert(nb >= 1);
  assert(nb <= NUM_MB_SEGMENTS);
  // bracket the input
  for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
  min_a = n;
@ -159,8 +165,9 @@ static void AssignSegments(VP8Encoder* const enc,
  range_a = max_a - min_a;
  // Spread initial centers evenly
-  for (n = 1, k = 0; n < 2 * nb; n += 2) {
+  for (k = 0, n = 1; k < nb; ++k, n += 2) {
-    centers[k++] = min_a + (n * range_a) / (2 * nb);
+    assert(n < 2 * nb);
    centers[k] = min_a + (n * range_a) / (2 * nb);
  }
  for (k = 0; k < MAX_ITERS_K_MEANS; ++k) {     // few iters are enough
@ -175,7 +182,7 @@ static void AssignSegments(VP8Encoder* const enc,
    n = 0;    // track the nearest center for current 'a'
    for (a = min_a; a <= max_a; ++a) {
      if (alphas[a]) {
-        while (n < nb - 1 && abs(a - centers[n + 1]) < abs(a - centers[n])) {
+        while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
          n++;
        }
        map[a] = n;
@ -223,18 +230,15 @@ static void AssignSegments(VP8Encoder* const enc,
 // susceptibility and set best modes for this macroblock.
 // Segment assignment is done later.
-// Number of modes to inspect for alpha_ evaluation. For high-quality settings
+// Number of modes to inspect for alpha_ evaluation. We don't need to test all
-// (method >= FAST_ANALYSIS_METHOD) we don't need to test all the possible modes
+// the possible modes during the analysis phase: we risk falling into a local
-// during the analysis phase.
+// optimum, or be subject to boundary effect
 #define FAST_ANALYSIS_METHOD 4  // method above which we do partial analysis
 #define MAX_INTRA16_MODE 2
 #define MAX_INTRA4_MODE  2
 #define MAX_UV_MODE      2
 static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
-  const int max_mode =
+  const int max_mode = MAX_INTRA16_MODE;
      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA16_MODE
                                                  : NUM_PRED_MODES;
  int mode;
  int best_alpha = DEFAULT_ALPHA;
  int best_mode = 0;
@ -260,9 +264,7 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                   int best_alpha) {
  uint8_t modes[16];
-  const int max_mode =
+  const int max_mode = MAX_INTRA4_MODE;
      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA4_MODE
                                                  : NUM_BMODES;
  int i4_alpha;
  VP8Histogram total_histo = { { 0 } };
  int cur_histo = 0;
@ -304,10 +306,9 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
  int best_alpha = DEFAULT_ALPHA;
  int best_mode = 0;
-  const int max_mode =
+  const int max_mode = MAX_UV_MODE;
      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_UV_MODE
                                                  : NUM_PRED_MODES;
  int mode;
  VP8MakeChroma8Preds(it);
  for (mode = 0; mode < max_mode; ++mode) {
    VP8Histogram histo = { { 0 } };
@ -382,38 +383,116 @@ static void ResetAllMBInfo(VP8Encoder* const enc) {
  // Default susceptibilities.
  enc->dqm_[0].alpha_ = 0;
  enc->dqm_[0].beta_ = 0;
-  // Note: we can't compute this alpha_ / uv_alpha_.
+  // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
  enc->alpha_ = 0;
  enc->uv_alpha_ = 0;
  WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
 }
 // State and results of one segment-analysis job.
 typedef struct {
  WebPWorker worker;          // worker whose hook is DoSegmentsJob()
  int alphas[MAX_ALPHA + 1];  // handed to MBAnalyze(), summed by MergeJobs()
  int alpha;                  // accumulated by MBAnalyze()
  int uv_alpha;               // accumulated by MBAnalyze()
  VP8EncIterator it;          // iterator handed to the worker as 'data2'
  int delta_progress;         // amount passed to VP8IteratorProgress()
 } SegmentJob;
 // Worker hook: analyzes every macroblock still to be visited by 'it' and
 // accumulates the results into 'job'. Returns 1 when there is nothing to do or
 // when all macroblocks were processed, and 0 as soon as
 // VP8IteratorProgress() reports a failure.
 static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
  uint8_t tmp[32 + ALIGN_CST];
  uint8_t* const scratch = (uint8_t*)DO_ALIGN(tmp);
  int ok;
  if (VP8IteratorIsDone(it)) return 1;
  for (;;) {
    // Let's pretend we have perfect lossless reconstruction.
    VP8IteratorImport(it, scratch);
    MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
    ok = VP8IteratorProgress(it, job->delta_progress);
    // Stop on failure, or once the iterator has no more macroblocks.
    if (!ok || !VP8IteratorNext(it)) break;
  }
  return ok;
 }
 // Adds the results collected by 'src' (alpha, uv_alpha and every entry of
 // alphas[]) to the ones stored in 'dst'. 'src' is left untouched.
 static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
  int a = 0;
  dst->uv_alpha += src->uv_alpha;
  dst->alpha += src->alpha;
  while (a <= MAX_ALPHA) {
    dst->alphas[a] += src->alphas[a];
    ++a;
  }
 }
 // Prepares 'job' for analysis: sets up its worker to run DoSegmentsJob(),
 // positions its iterator at 'start_row' with a countdown covering
 // (end_row - start_row) rows of macroblocks, and clears its accumulators.
 static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
                            int start_row, int end_row) {
  const int num_rows = end_row - start_row;
  WebPWorker* const worker = &job->worker;
  VP8EncIterator* const it = &job->it;

  // Worker setup: the hook receives the job and its iterator.
  WebPGetWorkerInterface()->Init(worker);
  worker->hook = (WebPWorkerHook)DoSegmentsJob;
  worker->data1 = job;
  worker->data2 = it;

  // Iterator setup.
  VP8IteratorInit(enc, it);
  VP8IteratorSetRow(it, start_row);
  VP8IteratorSetCountDown(it, num_rows * enc->mb_w_);

  // Clear the accumulators.
  job->uv_alpha = 0;
  job->alpha = 0;
  memset(job->alphas, 0, sizeof(job->alphas));

  // Only the job starting at row 0 records the progress, since the user's
  // hook is not expected to be multi-thread safe.
  if (start_row == 0) {
    job->delta_progress = 20;
  } else {
    job->delta_progress = 0;
  }
 }
 // main entry point
 int VP8EncAnalyze(VP8Encoder* const enc) {
  int ok = 1;
  const int do_segments =
      enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
      (enc->segment_hdr_.num_segments_ > 1) ||
      (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
  enc->alpha_ = 0;
  enc->uv_alpha_ = 0;
  if (do_segments) {
-    int alphas[MAX_ALPHA + 1] = { 0 };
+    const int last_row = enc->mb_h_;
-    VP8EncIterator it;
+    // We give a little more than a half work to the main thread.
-
+    const int split_row = (9 * last_row + 15) >> 4;
-    VP8IteratorInit(enc, &it);
+    const int total_mb = last_row * enc->mb_w_;
-    do {
+#ifdef WEBP_USE_THREAD
-      VP8IteratorImport(&it);
+    const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
-      MBAnalyze(&it, alphas, &enc->alpha_, &enc->uv_alpha_);
+    const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
-      ok = VP8IteratorProgress(&it, 20);
+#else
-      // Let's pretend we have perfect lossless reconstruction.
+    const int do_mt = 0;
-    } while (ok && VP8IteratorNext(&it, it.yuv_in_));
+#endif
-    enc->alpha_ /= enc->mb_w_ * enc->mb_h_;
+    const WebPWorkerInterface* const worker_interface =
-    enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
+        WebPGetWorkerInterface();
-    if (ok) AssignSegments(enc, alphas);
+    SegmentJob main_job;
    if (do_mt) {
      SegmentJob side_job;
      // Note the use of '&' instead of '&&' because we must call the functions
      // no matter what.
      InitSegmentJob(enc, &main_job, 0, split_row);
      InitSegmentJob(enc, &side_job, split_row, last_row);
      // we don't need to call Reset() on main_job.worker, since we're calling
      // WebPWorkerExecute() on it
      ok &= worker_interface->Reset(&side_job.worker);
      // launch the two jobs in parallel
      if (ok) {
        worker_interface->Launch(&side_job.worker);
        worker_interface->Execute(&main_job.worker);
        ok &= worker_interface->Sync(&side_job.worker);
        ok &= worker_interface->Sync(&main_job.worker);
      }
      worker_interface->End(&side_job.worker);
      if (ok) MergeJobs(&side_job, &main_job);  // merge results together
    } else {
      // Even for single-thread case, we use the generic Worker tools.
      InitSegmentJob(enc, &main_job, 0, last_row);
      worker_interface->Execute(&main_job.worker);
      ok &= worker_interface->Sync(&main_job.worker);
    }
    worker_interface->End(&main_job.worker);
    if (ok) {
      enc->alpha_ = main_job.alpha / total_mb;
      enc->uv_alpha_ = main_job.uv_alpha / total_mb;
      AssignSegments(enc, main_job.alphas);
    }
  } else {   // Use only one default segment.
    ResetAllMBInfo(enc);
  }
  return ok;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/Show More
+++ b/Show More