update ChangeLog

Change-Id: I1df7a610c466eb1cfc675e030c12fcff21f57c1b
faster rgb565/rgb4444/argb output
2025-07-15 05:19:48 +02:00 · 2015-12-18 00:04:24 -08:00 · 2015-12-17 23:38:58 -08:00 · 2015-12-17 23:25:23 -08:00 · 2015-12-17 19:45:14 -08:00 · 2015-12-17 19:45:14 -08:00
182 changed files with 24767 additions and 9753 deletions
--- a/.gitignore
+++ b/.gitignore
@ -19,6 +19,7 @@
 /stamp-h1
 Makefile
 Makefile.in
+examples/anim_diff
 examples/[cdv]webp
 examples/gif2webp
 examples/webpmux
--- a/.mailmap
+++ b/.mailmap
@ -6,3 +6,5 @@ Vikas Arora <vikasa@google.com>
 <vikasa@google.com> <vikasa@gmail.com>
 <vikasa@google.com> <vikaas.arora@gmail.com>
 <slobodan.prijic@imgtec.com> <Slobodan.Prijic@imgtec.com>
+<vrabaud@google.com> <vincent.rabaud@gmail.com>
+Tamar Levy <tamar.levy@intel.com>
--- a/9
+++ b/9
@ -7,19 +7,26 @@ Contributors:
 - Johann (johann dot koenig at duck dot com)
 - Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
 - Jyrki Alakuijala (jyrki at google dot com)
- levytamar82 (tamar dot levy at intel dot com)
+- Lode Vandevenne (lode at google dot com)
 - Lou Quillio (louquillio at google dot com)
 - Mans Rullgard (mans at mansr dot com)
 - Martin Olsson (mnemo at minimum dot se)
 - Mikołaj Zalewski (mikolajz at google dot com)
+- Mislav Bradac (mislavm at google dot com)
 - Noel Chromium (noel at chromium dot org)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
 - Pierre Joye (pierre dot php at gmail dot com)
+- Sam Clegg (sbc at chromium dot org)
+- Scott Hancher (seh at google dot com)
 - Scott LaVarnway (slavarnway at google dot com)
 - Scott Talbot (s at chikachow dot org)
 - Slobodan Prijic (slobodan dot prijic at imgtec dot com)
 - Somnath Banerjee (somnath dot banerjee at gmail dot com)
+- Sriraman Tallam (tmsriram at google dot com)
+- Tamar Levy (tamar dot levy at intel dot com)
 - Timothy Gu (timothygu99 at gmail dot com)
 - Urvang Joshi (urvang at google dot com)
 - Vikas Arora (vikasa at google dot com)
+- Vincent Rabaud (vrabaud at google dot com)
+- Yang Zhang (yang dot zhang at arm dot com)
--- a/Android.mk
+++ b/Android.mk
@ -15,6 +15,7 @@ ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
  # instructions to be generated for armv7a code. Instead target the neon code
  # specifically.
  NEON := c.neon
+  USE_CPUFEATURES := yes
 else
  NEON := c
 endif
@ -32,34 +33,64 @@ dec_srcs := \
    src/dec/webp.c \

 demux_srcs := \
+    src/demux/anim_decode.c \
    src/demux/demux.c \

 dsp_dec_srcs := \
    src/dsp/alpha_processing.c \
+    src/dsp/alpha_processing_mips_dsp_r2.c \
    src/dsp/alpha_processing_sse2.c \
+    src/dsp/alpha_processing_sse41.c \
+    src/dsp/argb.c \
+    src/dsp/argb_mips_dsp_r2.c \
+    src/dsp/argb_sse2.c \
    src/dsp/cpu.c \
    src/dsp/dec.c \
    src/dsp/dec_clip_tables.c \
    src/dsp/dec_mips32.c \
+    src/dsp/dec_mips_dsp_r2.c \
    src/dsp/dec_neon.$(NEON) \
    src/dsp/dec_sse2.c \
+    src/dsp/dec_sse41.c \
+    src/dsp/filters.c \
+    src/dsp/filters_mips_dsp_r2.c \
+    src/dsp/filters_sse2.c \
    src/dsp/lossless.c \
-    src/dsp/lossless_mips32.c \
+    src/dsp/lossless_mips_dsp_r2.c \
    src/dsp/lossless_neon.$(NEON) \
    src/dsp/lossless_sse2.c \
+    src/dsp/rescaler.c \
+    src/dsp/rescaler_mips32.c \
+    src/dsp/rescaler_mips_dsp_r2.c \
+    src/dsp/rescaler_neon.$(NEON) \
+    src/dsp/rescaler_sse2.c \
    src/dsp/upsampling.c \
+    src/dsp/upsampling_mips_dsp_r2.c \
    src/dsp/upsampling_neon.$(NEON) \
    src/dsp/upsampling_sse2.c \
    src/dsp/yuv.c \
    src/dsp/yuv_mips32.c \
+    src/dsp/yuv_mips_dsp_r2.c \
    src/dsp/yuv_sse2.c \

 dsp_enc_srcs := \
+    src/dsp/cost.c \
+    src/dsp/cost_mips32.c \
+    src/dsp/cost_mips_dsp_r2.c \
+    src/dsp/cost_sse2.c \
    src/dsp/enc.c \
    src/dsp/enc_avx2.c \
    src/dsp/enc_mips32.c \
+    src/dsp/enc_mips_dsp_r2.c \
    src/dsp/enc_neon.$(NEON) \
    src/dsp/enc_sse2.c \
+    src/dsp/enc_sse41.c \
+    src/dsp/lossless_enc.c \
+    src/dsp/lossless_enc_mips32.c \
+    src/dsp/lossless_enc_mips_dsp_r2.c \
+    src/dsp/lossless_enc_neon.$(NEON) \
+    src/dsp/lossless_enc_sse2.c \
+    src/dsp/lossless_enc_sse41.c \

 enc_srcs := \
    src/enc/alpha.c \
@ -67,10 +98,12 @@ enc_srcs := \
    src/enc/backward_references.c \
    src/enc/config.c \
    src/enc/cost.c \
+    src/enc/delta_palettization.c \
    src/enc/filter.c \
    src/enc/frame.c \
    src/enc/histogram.c \
    src/enc/iterator.c \
+    src/enc/near_lossless.c \
    src/enc/picture.c \
    src/enc/picture_csp.c \
    src/enc/picture_psnr.c \
@ -84,6 +117,7 @@ enc_srcs := \
    src/enc/webpenc.c \

 mux_srcs := \
+    src/mux/anim_encode.c \
    src/mux/muxedit.c \
    src/mux/muxinternal.c \
    src/mux/muxread.c \
@ -120,7 +154,9 @@ LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
 # prefer arm over thumb mode for performance gains
 LOCAL_ARM_MODE := arm

-LOCAL_STATIC_LIBRARIES := cpufeatures
+ifeq ($(USE_CPUFEATURES),yes)
+  LOCAL_STATIC_LIBRARIES := cpufeatures
+endif

 LOCAL_MODULE := webpdecoder_static

@ -212,4 +248,6 @@ endif

 include $(LOCAL_PATH)/examples/Android.mk

-$(call import-module,android/cpufeatures)
+ifeq ($(USE_CPUFEATURES),yes)
+  $(call import-module,android/cpufeatures)
+endif
--- a/884
+++ b/884
@ -1,104 +1,788 @@
-a661e50 Disable NEON code on Native Client
-fcd94e9 update ChangeLog (tag: v0.4.3-rc1)
-569fe57 update NEWS
-bd852f5 bump version to 0.4.3
-2d58b64 WebPPictureRescale: add a note about 0 width/height
-a0d8ca5 examples/Android.mk: add webpmux_example target
-34b1d29 Android.mk: add webpmux target
-7561988 Android.mk: add webpdemux target
-a987576 Android.mk: add webpdecoder{,_static} targets
-a6d4859 Android.mk: split source lists per-directory
-77544d5 fix iOS arm64 build with Xcode 6.3
-6dea157 doc/webp-container-spec: note MSB order for chunk diagrams
-f7cd57b doc/webp-container-spec: cosmetics
-1d6b250 vwebp: clear canvas at the beginning of each loop
-f97b3f8 webp-container-spec: clarify background clear on loop
-4ba83c1 vwebp: remove unnecessary static Help() prototype
-d34e8e3 vwebp/animation: display last frame on end-of-loop
-bbbc524 dec/vp8: clear 'dither_' on skipped blocks
-0339fa2 lossless_neon: enable subtract green for aarch64
-5a0c220 Regression fix for lossless decoding
-6e3a31d wicdec: (msvs) quiet some /analyze warnings
-b49a578 dwebp/WritePNG: mark png variables volatile
-0a4391a dwebp: include setjmp.h w/WEBP_HAVE_PNG
-90f1ec5 dwebp: correct sign in format strings
-b61ce86 VP8LEncodeStream: add an assert
-df1081b dsp/cpu: (msvs) add include for __cpuidex
-39aa055 dsp/cpu: (msvs) avoid immintrin.h on _M_ARM
-f814f42 dsp/cpu: add include for _xgetbv() w/MSVS
-8508ab9 cpu: fix AVX2 detection for gcc/clang targets
-5769623 fix handling of zero-sized partition #0 corner case
-b2e71a9 make the 'last_cpuinfo_used' variable names unique
-1273e84 add -Wformat-nonliteral and -Wformat-security
-3ae78eb multi-thread fix: lock each entry points with a static var
-5c1eeda webp-container-spec: remove references to fragments
-c5ceea4 enc_neon: fix building with non-Xcode clang (iOS)
-d0859d6 iosbuild: add x64_64 simulator support
-046732c WebPEncode: Support encoding same pic twice (even if modified)
-4426f50 webp/types.h: use inline for clang++/-std=c++11
-e297fc7 gif2webp: Use the default hint instead of WEBP_HINT_GRAPH.
-855fe43 Makefile.vc: add a 'legacy' RTLIBCFG option
-b7eb6d5 gif2webp: Support GIF_DISPOSE_RESTORE_PREVIOUS
-5691bdd gif2webp: Handle frames with odd offsets + disposal to background.
-8301da1 stopwatch.h: fix includes
-6a2209a update ChangeLog (tag: v0.4.2, origin/0.4.2, 0.4.2)
-36cad6a bit_reader.h: cosmetics: fix a typo
-e2ecae6 enc_mips32: workaround gcc-4.9 bug
-243e68d update ChangeLog (tag: v0.4.2-rc2)
-eec5f5f enc/vp8enci.h: update version number
-0c1b98d update NEWS
-69b0fc9 update AUTHORS
-857578a bump version to 0.4.2
-9129deb restore encode API compatibility
-f17b95e AssignSegments: quiet -Warray-bounds warning
-9c56c8a enc_neon: initialize vectors w/vdup_n_u32
-a008902 iosbuild: cleanup
-cc6de53 iosbuild: output autoconf req. on failure
-740d765 iosbuild: make iOS 6 the minimum requirement
-403023f iobuild.sh: only install .h files in Headers
-b65727b Premultiply with alpha during U/V downsampling
-8de0deb gif2webp: Background color correction
-f8b7d94 Amend the lossless spec according to issue #205, #206 and #224
-9102a7b Add a WebPExtractAlpha function to dsp
-e407b5d webpmux: simplify InitializeConfig()
-3e70e64 webpmux: fix indent
-be38f1a webpmux: fix exit status on numeric value parse error
-94dadcb webpmux: fix loop_count range check
-40b3a61 examples: warn on invalid numeric parameters
-b7d209a gif2webp: Handle frames with missing  graphic control extension
-bf0eb74 configure: simplify libpng-config invocation
-3740f7d Rectify bug in lossless incremental decoding.
-3ab0a37 make VP8LSetBitPos() set br->eos_ flag
-2e4312b Lossless decoding: fix eos_ flag condition
-e6609ac fix erroneous dec->status_ setting
-5692eae add a fallback to ALPHA_NO_COMPRESSION
-6ecd5bf ExUtilReadFromStdin: (windows) open stdin in bin mode
-4206ac6 webpmux: (windows) open stdout in binary mode
-d40e885 cwebp: (windows) open stdout in binary mode
-4aaf463 example_util: add ExUtilSetBinaryMode
-4c82ff7 webpmux man page: Clarify some title, descriptions and examples
-23d4fb3 dsp/lossless: workaround gcc-4.9 bug on arm
-5af7719 dsp.h: collect gcc/clang version test macros
-90d1124 enc_neon: enable QuantizeBlock for aarch64
-ee78e78 SmartRGBYUV: fix odd-width problem with pixel replication
-c9ac204 fix some MSVC64 warning about float conversion
-f4497a1 cpu: check for _MSC_VER before using msvc inline asm
-e2159fd faster RGB->YUV conversion function (~7% speedup)
-21abaa0 Add smart RGB->YUV conversion option -pre 4
-1a161e2 configure: add work around for gcc-4.9 aarch64 bug
-55b10de MIPS: mips32r2: added optimization for BSwap32
-76d2192 Update PATENTS to reflect s/VP8/WebM/g
-29a9db1 MIPS: detect mips32r6 and disable mips32r1 code
-245c4a6 Correctly use the AC_CANONICAL_* macros
-40aa8b6 cosmetics
-2ddcca5 cosmetics: remove some extraneous 'extern's
-f40dd7c vp8enci.h: cosmetics: fix '*' placement
-4610c9c bit_writer: cosmetics: rename kFlush() -> Flush()
-fc3c175 dsp: detect mips64 & disable mips32 code
-c1a7955 cwebp.1: restore quality description
-57a7e73 correct alpha_dithering_strength ABI check
-6c83157 correct WebPMemoryWriterClear ABI check
+7e7b6cc faster rgb565/rgb4444/argb output
+4c7f565 update NEWS
+1f62b6b update AUTHORS
+e224fdc update mailmap
+7110050 bump version to 0.5.0
+230a685 README: update help text, repo link
+d48e427 Merge "demux: accept raw bitstreams"
+99a01f4 Merge "Unify some entropy functions."
+4b025f1 Merge "configure: disable asserts by default"
+92cbddf Merge "fix PrintBlockInfo()"
+ca509a3 Unify some entropy functions.
+367bf90 fix PrintBlockInfo()
+b0547ff move back common constants for lossless_enc*.c into the .h
+fb4c783 lossless: simpler alpha cleanup preprocessing
+ba7f4b6 Merge "anim_diff: add brief description of options"
+47ddd5a Move some codec logic out of ./dsp .
+b4106c4 anim_diff: add brief description of options
+357f455 yuv_sse2: fix 32-bit visual studio build
+b9d80fa configure: disable asserts by default
+7badd3d cosmetic fix: sizeof(type) -> sizeof(*var)
+80ce27d Speed up 24-bit packing / unpacking in YUV / RGB conversions.
+68eebcb remove a TODO about rotation
+2dee296 remove few obsolete TODO about aligned loads in SSE2
+e0c0bb3 remove TODO about unused ref_lf_delta[]
+9cf1cc2 remove few TODO:   * 256 -> RD_DISTO_MULT   * don't use TDisto for UV mode picking
+7918964 Merge changes from topic 'demux-fragment-cleanup'
+47399f9 demux: remove GetFragment()
+d3cfb79 demux: remove dead fragment related TODO
+ab714b8 demux, Frame: remove is_fragment_ field
+b105921 yuv_sse2, cosmetics: fix indent
+466c92e demux,WebPIterator: remove fragment_num/num_fragments
+11714ff demux: remove WebPDemuxSelectFragment
+c0f7cc4 fix for bug #280: UMR in next->bits
+578beeb Merge "enc/Makefile.am: add missing headers"
+1a819f0 makefile.unix: make visibility=hidden the default
+d4f9c2e enc/Makefile.am: add missing headers
+846caff configure: check for -fvisibility=hidden
+3f3ea2c demux: accept raw bitstreams
+d6dad5d man cwebp: add precision about exactness of the 'lossless' mode
+46bb1e3 Merge "gifdec: remove utils.h include"
+2b882e9 Merge "Makefile.vc: define WEBP_HAVE_GIF for gifdec.c"
+892b923 Merge "man/*, AUTHORS: clarify origin of the tool"
+e5687a1 Merge "fix optimized build with -mcmodel=medium"
+e56e685 Makefile.vc: define WEBP_HAVE_GIF for gifdec.c
+4077d94 gifdec: remove utils.h include
+b5e30da man/*, AUTHORS: clarify origin of the tool
+b275e59 fix optimized build with -mcmodel=medium
+64da45a cosmetics, cwebp: fix indent
+038a060 Merge "add disto-based refinement for UV mode (if method = 1 or 2)"
+2835089 Provide an SSE2 implementation of CombinedShannonEntropy.
+e6c9351 add disto-based refinement for UV mode (if method = 1 or 2)
+04507dc Merge "fix undefined behaviour during shift, using a cast"
+793c526 Merge "wicdec: add support for reading from stdin"
+d3d1639 Optimize the heap usage in HistogramCombineGreedy.
+202a710 fix undefined behaviour during shift, using a cast
+14d27a4 improve method #2 by merging DistoRefine() and  SimpleQuantize()
+cb1ce99 Merge "10% faster table-less SSE2/NEON version of YUV->RGB conversion"
+ac761a3 10% faster table-less SSE2/NEON version of YUV->RGB conversion
+79fcf29 wicdec: add support for reading from stdin
+015f173 Merge "cwebp: add support for stdin input"
+a9947c3 cwebp: add support for stdin input
+7eb01ff Merge "Improved alpha cleanup for the webp encoder when prediction transform is used."
+fb8c910 Merge "introduce WebPMemToUint32 and WebPUint32ToMem for memory access"
+bd91af2 Merge "bit_reader: remove aarch64 BITS TODO"
+6c702b8 Speed up hash chain initialization using memset.
+4c60f63 make ReadPNG and ReadJPEG take a filename instead of a FILE
+464ed10 bit_reader: remove aarch64 BITS TODO
+d478e58 Merge "configure: update issue tracker"
+6938111 Improved alpha cleanup for the webp encoder when prediction transform is used.
+2c08aac introduce WebPMemToUint32 and WebPUint32ToMem for memory access
+010ca3d Fix FindMatchLength with non-aligned buffers.
+a90e1e3 README: add prerequisites for an autoconf build
+458f086 configure: update issue tracker
+3391459 vwebp: work around the transparent background with GLUT bug
+e4a7eed cosmetics: fix indent
+0837512 Merge "Make a separate case for low_effort in CopyImageWithPrediction"
+aa2eb2d Merge "cosmetics: fix indent"
+b7551e9 cosmetics: fix indent
+5bda52d Make a separate case for low_effort in CopyImageWithPrediction
+66fa598 Merge "configure: fix intrinsics build w/older gcc"
+5ae220b backward_references.c: Fixed compiler warning
+1556da0 Merge "configure: restore 2 warnings"
+71a17e5 configure: restore 2 warnings
+9eeabc0 configure: fix intrinsics build w/older gcc
+363babe Merge "fix some warning about unaligned 32b reads"
+a141178 Optimization in hash chain comparison for 64 bit Arrays were compared 32 bits at a time, it is now done 64 bits at a time. Overall encoding speed-up is only of 0.2% on @skal's small PNG corpus. It is of 3% on my initial 1.3 Mp desktop screenshot image.
+829bd14 Combine Huffman cost and bit entropy into one loop
+a7a954c Merge "lossless: make prediction in encoder work per scanline"
+61b605b Merge "fix of undefined multiply (int32 overflow)"
+239421c lossless: make prediction in encoder work per scanline
+f5ca40e fix of undefined multiply (int32 overflow)
+5cd2ef4 Merge changes from topic 'win-threading-compat'
+76ce918 Makefile.vc: enable WEBP_USE_THREAD for windows phone
+d2afe97 thread: use CreateThread for windows phone
+0fd0e12 thread: use WaitForSingleObjectEx if available
+63fadc9 thread: use InitializeCriticalSectionEx if available
+110ad58 thread: use native windows cond var if available
+912c9fd dec/webp: use GetLE(24|32) from utils
+f169448 utils/GetLE32: correct uint32 promotion
+158763d Merge "always call WebPInitSamplers(), don't try to be smart"
+3770f3b Merge "cleanup the YFIX/TFIX difference by removing some code and #define"
+a40f60a Merge "3% speed improvement for lossless webp encoder for low effort mode:"
+ed1c2bc always call WebPInitSamplers(), don't try to be smart
+b8c44f1 3% speed improvement for lossless webp encoder for low effort mode:
+997e103 cleanup the YFIX/TFIX difference by removing some code and #define
+d73d1c8 Merge "Make discarding invisible RGB values (cleanup alpha) the default."
+1f9be97 Make discarding invisible RGB values (cleanup alpha) the default.
+f240117 Make dwebp listen more to the -quiet flag
+b37b017 fix for issue #275: don't compare to out-of-bound pointers
+21735e0 speed-up trivial one-symbol decoding case for lossless
+397863b Refactor CopyPlane() and CopyPixels() methods: put them in utils.
+6ecd72f Re-enable encoding of alpha plane with color cache for next release.
+1f7148a Merge "remove unused fields from WebPDecoderOptions and WebPBitstreamFeatures"
+6ae395f Merge "use ExReadFile() for ReadYUV()"
+8076a00 gitignore list: add anim_diff.
+1c1702d use ExReadFile() for ReadYUV()
+775d3a3 remove unused fields from WebPDecoderOptions and WebPBitstreamFeatures
+c13245c AnimEncoder: Add a GetError() method.
+688b265 AnimDecoder API: Add a GetDemuxer() method.
+1aa4e3d WebPAnimDecoder: add an option to enable multi-threaded decoding.
+3584abc AnimDecoder: option to decode to common color modes.
+afd5a62 Merge "mux.h does NOT need to include encode.h"
+8550d44 Merge "migrate anim_diff tool from C++ to C89"
+96201e5 migrate anim_diff tool from C++ to C89
+945cfa3 mux.h does NOT need to include encode.h
+8da07e8 Merge "~2x faster SSE2 RGB24toY, BGR24toY, ARGBToY|UV"
+bfd3fc0 ~2x faster SSE2 RGB24toY, BGR24toY, ARGBToY|UV
+0243242 man/cwebp.1, cosmetics: escape '-'s
+96f5b42 man/cwebp: group lossy-only options
+52fdbdf extract some RGB24 to Luma conversion function from enc/ to dsp/
+ab8c230 add missing \n
+8304179 sync NEWS with 0.4.4
+5bd04a0 sync versions with 0.4.4
+8f1fcc1 Merge "Move ARGB->YUV functions from dec/vp8l.c to dsp/yuv.c"
+25bf2ce fix some warning about unaligned 32b reads
+922268f s/TIFF/WebP
+fa8927e Move ARGB->YUV functions from dec/vp8l.c to dsp/yuv.c
+9b37359 Merge "for ReadXXXX() image-readers, use the value of pic->use_argb"
+f7c507a Merge "remove unnecessary #include "yuv.h""
+7861578 for ReadXXXX() image-readers, use the value of pic->use_argb
+14e4043 remove unnecessary #include "yuv.h"
+469ba2c vwebp: fix incorrect clipping w/NO_BLEND
+4b9186b update issue tracker url (master)
+d64d376 change WEBP_ALIGN_CST value to 31
+f717b82 vp8l.c, cosmetics: fix indent after 95509f9
+927ccdc Merge "fix alignment of allocated memory in AllocateTransformBuffer"
+fea94b2 fix alignment of allocated memory in AllocateTransformBuffer
+5aa8d61 Merge "MIPS: rescaler code synced with C implementation"
+e7fb267 MIPS: rescaler code synced with C implementation
+93c86ed Merge "format_constants.h: MKFOURCC, correct cast"
+5d791d2 format_constants.h: MKFOURCC, correct cast
+65726cd dsp/lossless: Average2, make a constant unsigned
+d26d9de Use __has_builtin to check clang support
+12ec204 moved ALIGN_CST into util/utils.h and renamed WEBP_ALIGN_xxx
+a264083 Merge "rescaler: ~20% faster SSE2 implementation for lossless ImportRowExpand"
+3fb600d Merge "wicdec: fix alpha detection w/64bpp BGRA/RGBA"
+67c547f rescaler: ~20% faster SSE2 implementation for lossless ImportRowExpand
+99e3f81 Merge "large re-organization of the delta-palettization code"
+95509f9 large re-organization of the delta-palettization code
+74fb458 fix for weird msvc warning message
+ae49ad8 Merge "SSE2 implementation of ImportRowShrink"
+932fd4d SSE2 implementation of ImportRowShrink
+badfcba wicdec: fix alpha detection w/64bpp BGRA/RGBA
+35cafa6 Merge "iosbuild: fix linking with Xcode 7 / iOS SDK 9"
+b0c9d8a label rename: NO_CHANGE -> NoChange
+b4e731c neon-implementation for rescaler code
+db1321a iosbuild: fix linking with Xcode 7 / iOS SDK 9
+6dfa5e3 rescaler: better handling of the fxy_scale=0 special case.
+55c0529 Revert "rescaler: better handling of the fxy_scale=0 special case."
+9f226bf rescaler: better handling of the fxy_scale=0 special case.
+f7b8f90 delta_palettization.*: add copyright
+c1e1b71 Changed delta palette to compress better
+0dd2826 Merge "Add delta_palettization feature to WebP"
+48f66b6 Add delta_palettization feature to WebP
+27933e2 anim_encoder: drop a frame if it has same pixels as the prev frame.
+df9f6ec Merge "webpmux/DisplayInfo: send non-error output to stdout"
+8af4993 Merge "rescaler_mips_dsp_r2: cosmetics, fix indent"
+2b9d249 Merge "rescaler: cosmetics, join two lines"
+cc020a8 webpmux/DisplayInfo: send non-error output to stdout
+a288e74 configure: add -Wshorten-64-to-32
+c4c3cf2 pngdec: fix type conversion warnings
+bef8e97 webpmux: fix type conversion warning
+5a84460 rescaler_mips_dsp_r2: cosmetics, fix indent
+acde0aa rescaler: cosmetics, join two lines
+306ce4f rescaler: move the 1x1 or 2x1 handling one level up
+cced974 remove _mm_set_epi64x(), which is too specific
+56668c9 fix warnings about uint64_t -> uint32_t conversion
+76a7dc3 rescaler: add some SSE2 code
+1df1d0e rescaler: harmonize function protos
+9ba1894 rescaler: simplify ImportRow logic
+5ff0079 fix rescaler vertical interpolation
+cd82440 VP8LAllocateHistogramSet: align histogram[] entries
+a406b1d Merge "fix memory over-allocation in lossless rescaler init"
+0fde33e add missing const in VP8InitFrame signature
+ac7d5e8 fix memory over-allocation in lossless rescaler init
+017f8cc Loosen the buffer size checks for Y/U/V/A too.
+15ca501 loosen the padding check on buffer size
+d623a87 dec_neon: add whitespace around stringizing operator
+29377d5 dsp/mips: cosmetics: add whitespace around XSTR macro
+eebaf97 dsp/mips: add whitespace around stringizing operator
+d39dc8f Create a WebPAnimDecoder API.
+03fb752 gif2webp: print output file size
+14efabb Android: limit use of cpufeatures
+7b83adb preparatory cosmetics for Rescaler code fix and clean-up
+77fb41c dec/vp8l/DecodeAlphaData: remove redundant cast
+90fcfcd Insert less hash chain entries from the beginnings of long copies.
+bd55604 SSE2: add yuv444 converters, re-using yuv_sse2.c
+41a5d99 add a -quiet option to 'dwebp'
+80ab3ed Merge "README: update dwebp help output after 1e595fe"
+32b71b2 README: update dwebp help output after 1e595fe
+3ec1182 use the DispatchAlpha() call from dsp
+c5f0062 incorporate bzero() into WebPRescalerInit() instead of call site
+3ebcdd4 remove duplicate "#include <stdlib.h>"
+1e595fe dwebp: add -resize as a synonym for -scale
+24a9693 dec: allow 0 as a scaling dimension
+b918724 utils/rescaler: add WebPRescalerGetScaledDimensions
+923e8ed Merge "update NEWS"
+020fd09 Merge "WebPPictureDistortion: support ARGB format for 'pic' when computing distortion."
+6a5292f update NEWS
+56a2e9f WebPPictureDistortion: support ARGB format for 'pic' when computing distortion.
+0ae582e configure: test and add -Wunreachable-code
+c2f9dc0 bit_writer: convert VP8L macro values to immediates
+b969f88 Reduce magic in palette reordering
+acb297e anim_diff: add a -raw_comparison flag
+155c1b2 Merge changes I76f4d6fe,I45434639
+717e4d5 mips32/mipsDSPr2: function ImportRow rebased
+7df9389 fix rescaling bug (uninitialized read, see bug #254).
+5cdcd56 lossless_enc_neon: add VP8LTransformColor
+a53c336 lossless_neon: add VP8LTransformColorInverse
+99131e7 Merge changes I9fb25a89,Ibc648e9e
+c455676 simplify the main loop for downscaling
+2a010f9 lossless_neon: remove predictors 5-13
+ca221bb ll_enc_neon: enable VP8LSubtractGreenFromBlueAndRed
+585d93d Container spec: clarify ordering of ALPH chunk.
+01d61fd lossless: ~20 % speedup
+f722c8f lossless: Speed up ComputeCacheEntropy by 40 %
+1ceecdc add a VP8LColorCacheSet() method for color cache
+17eb609 lossless: Allow copying from prev row in rle-mode.
+f3a7a5b lossless: bit writer optimization
+d97b9ff Merge changes from topic 'lossless-enc-improvements'
+0250dfc msvc: fix pointer type warning in BitsLog2Floor
+52931fd lossless: combine the Huffman code with extra bits
+c4855ca lossless: Inlining add literal
+8e9c94d lossless: simplify HashChainFindCopy heuristics
+888429f lossless: 0.5 % compression density improvement
+7b23b19 lossless: Add zeroes into the predicted histograms.
+85b44d8 lossless: encoding, don't compute unnecessary histo
+d92453f lossless: Remove about 25 % of the speed degradation
+2cce031 Faster alpha coding for webp
+5e75642 lossless: rle mode not to accept lengths smaller than 4.
+84326e4 lossless: Less code for the entropy selection
+16ab951 lossless: 0.37 % compression density improvement
+822f113 add WebPFree() to the API
+0ae2c2e SSE2/SSE41: optimize SSE_16xN loops
+39216e5 cosmetics: fix indent after 32462a07
+559e54c Merge "SSE2: slightly faster FTransformWHT"
+8ef9a63 SSE2: slightly faster FTransformWHT
+f27f773 lossless_neon: enable VP8LAddGreenToBlueAndRed
+36e9c4b SSE2: minor cosmetrics on in-loop filter code
+4741fac dsp/lossless_*sse2: remove some unnecessary inlines
+1819965 fix warning ("left shift of negative value") using a cast
+7017001 SSE2: speed-up some lossless-encoding functions
+abcb012 Merge "SSE2: slightly faster (~5%) AddGreenToBlueAndRed()"
+2df5bd3 Merge "Speedup to HuffmanCostCombinedCount"
+9e356d6 SSE2: slightly faster (~5%) AddGreenToBlueAndRed()
+fc6c75a SSE2: 53% faster TransformColor[Inverse]
+49073da SSE2: 46% speed-up of TransformColor[Inverse]
+32462a0 Speedup to HuffmanCostCombinedCount
+f3d687e SSE4.1 implementation of some lossless encoding functions
+bfc300c SSE4.1 implementation of some alpha-processing functions
+7f9c98f Merge "sse2 in-loop: simplify SignedShift8b() a bit"
+ef314a5 dec_sse2/GetNotHEV: micro optimization
+a729cff sse2 in-loop: simplify SignedShift8b() a bit
+422ec9f simplify Load8x4() a bit
+8df238e Merge "remove some duplicate FlipSign()"
+751506c remove some duplicate FlipSign()
+65ef5af Merge "lossless: 0.13% compression density gain"
+2beef2f lossless: 0.13% compression density gain
+3033f24 lossless: 0.06 % compression density improvement
+64960da dec_neon: add VE8uv / VE16
+14dbd87 dec_neon: add HE8uv / HE16
+ac76801 introduce FTransform2 to perform two transforms at a time.
+aa6065a dec_neon: use vld1_dup(mem) rather than vdup(mem[0])
+8b63ac7 Merge "dec_neon: add TM16"
+f51be09 Merge "dec_neon/TrueMotion: simply left border load"
+dc48196 dec_neon: add TM16
+ea95b30 dec_neon/TrueMotion: simply left border load
+f262d61 speed-up SetResidualSSE2
+bf46d0a fix mips2 build target
+929a0fd enc_sse2/TTransform: simplify abs calculation
+17dbd05 enc_sse2/CollectHistogram: simplify abs calculation
+a6c1593 dec_neon: add DC16 intra predictors
+03b4f50 Makefile.vc: add anim_diff build support.
+1b98987 Merge changes I9cd84125,Iee7e387f,I7548be72
+acd7b5a Introduce a test tool anim_diff.
+f274a96 dsp/enc_sse2: add luma4 intra predictors
+040b11b dsp/enc_sse2: add chroma intra predictors
+aee021b dsp/enc_sse2: add luma16 intra predictors
+9e00a49 makefile.unix: remove superclean target
+cefc9c0 makefile.unix: clean up after extras target
+4c9af02 dec_neon: add DC8uvNoTopLeft
+dd55b87 Merge "doc/webp-container-spec: update repo browser link"
+f048696 doc/webp-container-spec: update repo browser link
+9287761 Merge "GetResidualCostSSE2: simplify abs calculation"
+0e00936 dsp/cpu.c(x86): check maximum supported cpuid feature
+b243a4b GetResidualCostSSE2: simplify abs calculation
+6d4602b Merge "fix typo: constitutes -> constitute"
+5fe1fe3 fix typo: constitutes -> constitute
+b83bd7c Merge "populate 'libwebpextras' with: import gray, rgb565 and rgb4444 functions"
+b0114a3 Merge "histogram.h: cosmetics: remove unnecessary includes"
+feab45e gifdec: Move inclusion of webp/config.h to header.
+dbba67d histogram.h: cosmetics: remove unnecessary includes
+e978fec Merge "VP8LBitReader: fix remaining ubsan error with large shifts"
+d6fe588 Merge "ReconstructRow: move some one-time inits out of the main loop"
+a21d647 ReconstructRow: move some one-time inits out of the main loop
+7a01c3c VP8LBitReader: fix remaining ubsan error with large shifts
+7fa67c9 change GetPixPairHash64() return type to uint32_t
+ec1fb9f Merge "dsp/enc.c: cosmetics: move DST() def closer to use"
+7073bfb Merge "split 64-mult hashing into two 32-bit multiplies"
+0768b25 dsp/enc.c: cosmetics: move DST() def closer to use
+6a48b8f Merge "fix MSVC size_t->int conversion warning"
+1db07cd Merge "anim_encode: cosmetics: fix alignment"
+e28271a anim_encode: cosmetics: fix alignment
+7fe357b split 64-mult hashing into two 32-bit multiplies
+af74c14 populate 'libwebpextras' with: import gray, rgb565 and rgb4444 functions
+6121413 remove VP8Residual::cost unused field
+e254482 fix MSVC size_t->int conversion warning
+b69a6c3 vwebp: don't redefine snprintf with VS2015+
+0ac29c5 AnimEncoder API: Consistent use of trailing underscores in struct.
+d484555 AnimEncoder API: Use timestamp instead of duration as input to Add().
+9904e36 dsp/dec_sse2: DC8uv / DC8uvNoLeft speedup
+7df2049 dsp/dec_sse2: DC16 / DC16NoLeft speedup
+8e515df Merge "makefile.unix: add some missing headers"
+db12250 cosmetics: vp8enci.h: break long line
+bf516a8 makefile.unix: add some missing headers
+b44eda3 dsp: add DSP_INIT_STUB
+03e76e9 clarify the comment about double-setting the status in SetError()
+9fecdd7 remove unused EmitRGB()
+43f010d move ReconstructRow to top
+82d9802 add a dec/common.h header to collect common enc/dec #defines
+5d4744a Merge "enc_sse41: add Disto4x4 / Disto16x16"
+e38886a mux.h: Bump up ABI version
+46305ca configure: add --disable-<avx2|sse4.1|sse2>
+2fc8b65 CPPFLAGS->CFLAGS for detecting sse4.1 in preprocessor
+1a338fb enc_sse41: add Disto4x4 / Disto16x16
+9405550 encoding SSE4.1 stub for StoreHistogram + Quantize + SSE_16xN
+c64659e remove duplicate variables after the lossless{_enc}.c split
+67ba7c7 enc_sse2: call local FTransform in CollectHistogram
+1824979 dsp: s/VP8LSetHistogramData/VP8SetHistogramData/
+ede5e15 cosmetics: dsp/lossless.h: reorder prototypes
+553051f dsp/lossless: split enc/dec functions
+9064adc Merge "conditionally add -msse4.1 in Makefile.unix"
+cecf509 dsp/yuv*.c: rework WEBP_USE_<arch> ifdef
+6584d39 dsp/upsampling*.c: rework WEBP_USE_<arch> ifdef
+8080942 dsp/rescaler*.c: rework WEBP_USE_<arch> ifdef
+1d93dde dsp/lossless*.c: rework WEBP_USE_<arch> ifdef
+73805ff dsp/filters*.c: rework WEBP_USE_<arch> ifdef
+fbdcef2 dsp/enc*.c: rework WEBP_USE_<arch> ifdef
+66de69c dsp/dec*.c: rework WEBP_USE_<arch> ifdef
+48e4ffd dsp/cost*.c: rework WEBP_USE_<arch> ifdef
+29fd6f9 dsp/argb*.c: rework WEBP_USE_<arch> ifdef
+80ff381 dsp/alpha*.c: rework WEBP_USE_<arch> ifdef
+bf09cf1 conditionally add -msse4.1 in Makefile.unix
+e9570dd stub for SSE4.1 support.
+4a95384 Merge "dsp: add sse4.1 detection"
+cabf4bd dsp: add sse4.1 detection
+4ecba1a thread.h: rename interface param
+b8d706c Merge "sync versions with 0.4.3"
+ae64a71 Merge "add shell for libwebpextras"
+92a5da9 sync versions with 0.4.3
+9d4e2d1 Merge "~30% faster smart-yuv (-pre 4) with early-out criterion"
+b1bdbba ~30% faster smart-yuv (-pre 4) with early-out criterion
+7efb974 Merge "Disable NEON code on Native Client"
+ac4f578 Disable NEON code on Native Client
+0873f85 AnimEncoder API: Support input frames in YUV(A) format.
+5c176d2 add shell for libwebpextras
+44bd956 fix signature for VP8RecordCoeffTokens()
+c9b8ea0 small cosmetics on TokenBuffer.
+76394c0 Merge "MIPS: dspr2: added optimization for TrueMotion"
+0f77369 WebPPictureRescale: add a note about 0 width/height
+241bb5d MIPS: dspr2: added optimization for TrueMotion
+6cef0e4 examples/Android.mk: add webpmux_example target
+53c16ff Android.mk: add webpmux target
+21852a0 Android.mk: add webpdemux target
+8697a3b Android.mk: add webpdecoder{,_static} targets
+4a67049 Android.mk: split source lists per-directory
+b5e7942 MIPS: dspr2: Added optimization for some convert functions
+0f595db MIPS: dspr2: Added optimization for some convert functions
+8a218b4 MIPS: [mips32|dspr2]: GetResidualCost rebased
+ef98750 Speedup method StoreImageToBitMask by 5%.
+602a00f fix iOS arm64 build with Xcode 6.3
+2382050 1-2% faster encoding by removing an indirection in GetResidualCost()
+eddb7e7 MIPS: dspr2: added otpimization for DC8uv, DC8uvNoTop and DC8uvNoLeft
+73ba291 MIPS: dspr2: added optimization for functions RD4 and LD4
+c7129da Merge "4-5% faster encoding using SSE2 for GetResidualCost"
+94380d0 MIPS: dspr2: added optimizaton for functions VE4 and DC4
+2a40709 4-5% faster encoding using SSE2 for GetResidualCost
+17e1986 Merge "MIPS: dspr2: added optimization for simple filtering functions"
+3ec404c Merge "dsp: normalize WEBP_TSAN_IGNORE_FUNCTION usage"
+b969f5d dsp: normalize WEBP_TSAN_IGNORE_FUNCTION usage
+d7b8e71 MIPS: dspr2: added optimization for simple filtering functions
+235f774 Merge "MIPS: dspr2: Added optimization for function VP8LTransformColorInverse_C"
+42a8a62 MIPS: dspr2: Added optimization for function VP8LTransformColorInverse_C
+b442bef Merge "ApplyFiltersAndEncode: only copy lossless stats"
+b510fbf doc/webp-container-spec: note MSB order for chunk diagrams
+9bc0f92 ApplyFiltersAndEncode: only copy lossless stats
+3030f11 Merge "dsp/mips: add some missing TSan annotations"
+dfcf459 Merge "MIPS: dspr2: Added optimization for function VP8LAddGreenToBlueAndRed_C"
+55c75a2 dsp/mips: add some missing TSan annotations
+2cb879f MIPS: dspr2: Added optimization for function VP8LAddGreenToBlueAndRed_C
+e155601 move some cost tables from enc/ to dsp/
+c3a0316 Merge "picture_csp: fix build w/USE_GAMMA_COMPRESSION undefined"
+39537d7 Merge "VP8LDspInitMIPSdspR2: add missing TSan annotation"
+1dd419c picture_csp: fix build w/USE_GAMMA_COMPRESSION undefined
+43fd354 VP8LDspInitMIPSdspR2: add missing TSan annotation
+c7233df Merge "VP8LDspInit: remove memcpy"
+0ec4da9 picture_csp::InitGammaTables*: add missing TSan annotations
+35579a4 VP8LDspInit: remove memcpy
+97f6aff VP8YUVInit: add missing TSan annotation
+f9016d6 dsp/enc::InitTables: add missing TSan annotation
+e3d9771 VP8EncDspCostInit*: add missing TSan annotations
+d97c143 Merge "doc/webp-container-spec: cosmetics"
+309b790 MIPS: mips32: Added optimization for function SetResidualCoeffs
+a987fae MIPS: dspr2: added optimization for function GetResidualCost
+e7d3df2 doc/webp-container-spec: cosmetics
+be6635e Merge "VP8TBufferClear: remove some misleading const's"
+02971e7 Merge "VP8EmitTokens: remove unnecessary param void cast"
+3b77e5a VP8TBufferClear: remove some misleading const's
+aa139c8 VP8EmitTokens: remove unnecessary param void cast
+c24d8f1 cosmetics: upsampling_sse2: add const to some casts
+1829c42 cosmetics: lossless_sse2: add const to some casts
+183168f cosmetics: enc_sse2: add const to some casts
+860badc cosmetics: dec_sse2: add const to some casts
+0254db9 cosmetics: argb_sse2: add const to some casts
+1aadf85 cosmetics: alpha_processing_sse2: add const to some casts
+1579de3 vwebp: clear canvas at the beginning of each loop
+4b9fa5d Merge "webp-container-spec: clarify background clear on loop"
+4c82284 Updated the near-lossless level mapping.
+5603947 webp-container-spec: clarify background clear on loop
+19f0ba0 Implement true-motion prediction in SSE2
+774d4cb make VP8PredLuma16[] array non-const
+d7eabb8 Merge "MIPS: dspr2: Added optimization for function CollectHistogram"
+fe42739 Use integers for kmin/kmax for simplicity.
+b9df35f AnimEncode API: kmax=0 should imply all keyframes.
+6ce296d MIPS: dspr2: Added optimization for function CollectHistogram
+2c906c4 vwebp: remove unnecessary static Help() prototype
+be0fd1d Merge "dec/vp8: clear 'dither_' on skipped blocks"
+e96170f Merge "vwebp/animation: display last frame on end-of-loop"
+0f017b5 vwebp/animation: display last frame on end-of-loop
+c86b40c enc/near_lossless.c: fix alignment
+66935fb dec/vp8: clear 'dither_' on skipped blocks
+b7de794 Merge "lossless_neon: enable subtract green for aarch64"
+77724f7 SSE2 version of GradientUnfilter
+416e1ce lossless_neon: enable subtract green for aarch64
+72831f6 Speedup AnalyzeAndInit for low effort compression.
+a659748 Speedup Analyze methods for lossless compression.
+98c8138 Enable Near-lossless feature.
+c6b2454 AnimEncoder API: Fix for kmax=1 and default kmin case.
+022d2f8 add SSE2 variants for alpha filtering functions
+2db15a9 Temporarily disable encoding of alpha plane with color cache.
+1d575cc Merge "Lossless decoding: Remove an unnecessary if condition."
+cafa1d8 Merge "Simplify backward refs calculation for low-effort."
+7afdaf8 Alpha coding: reorganize the filter/unfiltering code
+4d6d728 Simplify backward refs calculation for low-effort.
+ec0d1be Cleaup Near-lossless code.
+9814ddb Remove the post-transform near-lossless heuristic.
+4509e32 Lossless decoding: Remove an unnecessary if condition.
+f2ebc4a Merge "Regression fix for lossless decoding"
+783a8cd Regression fix for lossless decoding
+9a062b8 AnimEncoder: Bugfix for kmin = 1 and kmax = 2.
+0f027a7 simplify smart RGB->YUV conversion code
+0d5b334 BackwardReferencesHashChainFollowChosenPath: remove unused variable
+f480d1a Fix to near lossless artefacts on palettized images.
+d4615d0 Merge changes Ia1686828,I399fda40
+cb4a18a rename HashChainInit into HashChainReset
+f079e48 use uint16_t for chosen_path[]
+da09121 MIPS: dspr2: Added optimization for function FTransformWHT
+b8c2013 Merge "wicdec: (msvs) quiet some /analyze warnings"
+9b228b5 wicdec: (msvs) quiet some /analyze warnings
+daeb276 Merge "MIPS: dspr2: Added optimization for MultARGBRow function"
+cc08742 Merge "dsp/cpu: (msvs) add include for __cpuidex"
+4a82aab Merge changes I87544e92,I0bb6cda5
+7a19139 dwebp/WritePNG: mark png variables volatile
+775dfad dwebp: include setjmp.h w/WEBP_HAVE_PNG
+47d26be dwebp: correct sign in format strings
+f0e0677 VP8LEncodeStream: add an assert
+c5f7747 VP8LColorCacheCopy: promote an int before shifting
+0de5f33 dsp/cpu: (msvs) add include for __cpuidex
+7d850f7 MIPS: dspr2: Added optimization for MultARGBRow function
+5487529 MIPS: dspr2: added optimization for function QuantizeBlock
+4fbe9cf dsp/cpu: (msvs) avoid immintrin.h on _M_ARM
+3fd5903 simplify/reorganize arguments for CollectColorBlueTransforms
+b9e356b Disable costly TraceBackwards for method=0.
+a7e7caa MIPS: dspr2: added optimization for function TransformColorRed
+2cb3918 Merge "MIPS: dspr2: added optimization for function TransformColorBlue"
+279e661 Merge "dsp/cpu: add include for _xgetbv() w/MSVS"
+b6c0428 dsp/cpu: add include for _xgetbv() w/MSVS
+d1c4ffa gif2webp: Move GIF decoding related code to a support library.
+07c3955 Merge "AnimEncoder API: Add info in README.mux"
+7b16197 MIPS: dspr2: added optimization for function TransformColorBlue
+d7c4b02 cpu: fix AVX2 detection for gcc/clang targets
+9d29946 AnimEncoder API: Add info in README.mux
+d581ba4 follow-up: clean up WebPRescalerXXX dsp function
+f8740f0 dsp: s/USE_INTRINSICS/WEBP_USE_INTRINSICS/
+ce73abe Merge "introduce a separate WebPRescalerDspInit to initialize pointers"
+ab66bec introduce a separate WebPRescalerDspInit to initialize pointers
+205c7f2 fix handling of zero-sized partition #0 corner case
+cbcdd5f Merge "move rescaler functions to rescaler* files in src/dsp/"
+bf586e8 Merge changes I230b3532,Idf3057a7
+6dc79dc Merge "anim_encode: fix type conversion warnings"
+11fce25 Merge "dec_neon: remove returns from void functions"
+c4e63f9 Makefile.vc: add gif2webp target
+4f43d38 enable NEON for Windows ARM builds
+3f6615a Makefile.vc: add rudimentary Windows ARM support
+e7c5954 dec_neon: remove returns from void functions
+f79c163 anim_encode: fix type conversion warnings
+0f54f1e Remove gif2webp_util which is no longer needed.
+cbcbedd move rescaler functions to rescaler* files in src/dsp/
+ac79ed1 webpmux: remove experimental fragment handling
+e8694d4 mux: remove experimental FRGM parsing
+9e92b6e AnimEncoder API: Optimize single-frame animated images
+abbae27 Merge "Move over gif2webp to the new AnimEncoder API."
+a28c4b3 MIPS: move WORK_AROUND_GCC define to appropriate place
+012d2c6 MIPS: dspr2: added optimization for functions SSEAxB
+67720c8 Move over gif2webp to the new AnimEncoder API.
+9241ecf MIPS: dspr2: added optimization for function Average
+9422211 Merge "Tune BackwardReferencesLz77 for low_effort (m=0)."
+df40057 Merge "Speedup VP8LGetHistoImageSymbols for low effort (m=0) mode."
+ea08466 Tune BackwardReferencesLz77 for low_effort (m=0).
+b0b973c Speedup VP8LGetHistoImageSymbols for low effort (m=0) mode.
+c6d3292 argb_sse2: cosmetics
+67f601c make the 'last_cpuinfo_used' variable names unique
+b948986 AnimEncoder API: Init method for default options.
+856f8ec Merge "AnimEncoder API: Remove AnimEncoderFrameOptions."
+c537514 Merge "AnimEncoder API: GenerateCandidates bugfix."
+dc0ce03 Merge "AnimEncoder API: Compute change rectangle for first frame too."
+f00b639 Merge "AnimEncoder API: In Assemble(), always set animation parameters."
+29ed796 Merge "AnimEncoder lib cleanup: prev to prev canvas not needed."
+9f0dd6e Merge "WebPAnimEncoder API: Header and implementation"
+5e56bbe AnimEncoder API: Remove AnimEncoderFrameOptions.
+b902c3e AnimEncoder API: GenerateCandidates bugfix.
+ef3c39b AnimEncoder API: Compute change rectangle for first frame too.
+eec423a AnimEncoder API: In Assemble(), always set animation parameters.
+ae1c046 AnimEncoder lib cleanup: prev to prev canvas not needed.
+4b997ae WebPAnimEncoder API: Header and implementation
+72208be move argb_*.o build target to encoder list
+9592053 Merge "multi-thread fix: lock each entry points with a static var"
+4c1b300 Merge "SSE2 implementation of VP8PackARGB"
+fbcc200 Merge "add -Wformat-nonliteral and -Wformat-security"
+80d950d add -Wformat-nonliteral and -Wformat-security
+04c20e7 Merge "MIPS: dspr2: added optimization for function Intra4Preds"
+a437694 multi-thread fix: lock each entry points with a static var
+ca7f60d SSE2 implementation of VP8PackARGB
+72d573f simplify the PackARGB signature
+4e2589f demux: restore strict fragment flag check
+4ba8e07 Merge "webp-container-spec: remove references to fragments"
+e752f0a Merge "demux: remove experimental FRGM parsing"
+f8abb11 Merge changes I109ec4d9,I73fe7743
+ae2188a MIPS: dspr2: added optimization for function Intra4Preds
+1f4b864 move VP8EncDspARGBInit() call closer to where it's needed
+14108d7 dec_neon: add DC8uvNoTop / DC8uvNoLeft
+d8340da dec_neon: add DC8uv
+a66e66c webp-container-spec: remove references to fragments
+7ce8788 MIPS: dspr2: added optimization for function MakeARGB32
+012e623 demux: remove experimental FRGM parsing
+87c3d53 method=0: Don't evaluate any predictor
+6f4fcb9 Merge "MIPS: dspr2: added optimization for function ImportRow"
+2428445 replace unneeded calls to HistogramCopy() by swaps
+bdf7b40 MIPS: dspr2: added optimization for function ImportRow
+e66a922 Merge "MIPS: dspr2: added optimization for function ExportRowC"
+c279fec MIPS: dspr2: added optimization for function ExportRowC
+31a9cf6 Speedup WebP lossless compression for low effort (m=0) mode with following: - Disable Cross-Color transform. - Evaluate predictors #11 (paeth), #12 and #13 only.
+9275d91 MIPS: dspr2: added optimization for function TrueMotion
+26106d6 Merge "enc_neon: fix building with non-Xcode clang (iOS)"
+1c4e3ef unroll the kBands[] indirection to remove a dereference in GetCoeffs()
+a3946b8 enc_neon: fix building with non-Xcode clang (iOS)
+8ed9c00 Merge "simplify the Histogram struct, to only store max_value and last_nz"
+bad7757 simplify the Histogram struct, to only store max_value and last_nz
+3cca0dc MIPS: dspr2: Added optimization for DCMode function
+37e395f MIPS: fix functions to use generic BPS istead of hardcoded value
+9475bef PickBestUV: fix VP8Copy16x8 invocation
+441f273 Merge changes I55f8da52,Id73a1e96
+4a279a6 cosmetics: add some missing != NULL comparisons
+66ad372 factorize BPS definition in dsp.h and add VP8Copy16x8
+432e5b5 make ALIGN_xxx naming consistent
+5760604 encoder: switch BPS to 32 instead of 16
+1b66bbe MIPS: dspr2: added optimization for function TransformColor_C
+c6d0f9e histogram: cosmetics
+f399d30 Merge changes I6eac17e5,I32d2b514
+9de9074 dec_neon: add TM8uv
+8e517ec bit_reader/kVP8NewRange: range_t -> uint8_t
+e185713 dsp: initialize VP8PredChroma8 in VP8DspInit()
+e0c809a Move Entropy methods to lossless.c
+a96ccf8 iosbuild: add x64_64 simulator support
+a0df551 Remove handling for WEBP_HINT_GRAPH
+413dfc0 Move static method definition before its usage.
+0f23566 Update BackwardRefsWithLocalCache.
+d69e36e Remove TODOs from lossless encoder code.
+fdaac8e Optmize VP8LGetBackwardReferences LZ77 references.
+2f0e2ba MIPS: dspr2: added optimization for function Select
+a3e79a4 Merge "WebPEncode: Support encoding same pic twice (even if modified)"
+e4f4ddd WebPEncode: Support encoding same pic twice (even if modified)
+cbc3fbb Merge "Updated VP8LGetBackwardReferences and color cache."
+95a9bd8 Updated VP8LGetBackwardReferences and color cache.
+54f2c14 MIPS: dspr2: added optimization for function FTransform
+aa42f42 MIPS: dspr2: Added optimization for function VP8LSubtractGreenFromBlueAndRed
+11a25f7 Merge "FlattenSimilarBlocks should only be tried when blending is possible."
+5cccdad FlattenSimilarBlocks should only be tried when blending is possible.
+95ca44a MIPS: dspr2: added optimization for Disto4x4
+4171b67 backward_references.c: reindent after c8581b0
+c8581b0 Optimize BackwardReferences for RLE encoding.
+5798eee MIPS: dspr2: unfilters bugfix (Ie7b7387478a6b5c3f08691628ae00f059cf6d899)
+4167a3f Optimize backwardreferences
+d18554c Merge "webp/types.h: use inline for clang++/-std=c++11"
+7489b0e gif2webp: Add '-min-size' option to get best compression.
+77bdddf Speed up BackwardReferences
+6638710 webp/types.h: use inline for clang++/-std=c++11
+abf0420 Enable entropy based merge histo for (q<100)
+572022a filters_mips_dsp_r2.c: disable unfilters
+a28e21b MIPS: dspr2: Added optimization for function ClampedAddSubtractFull
+18d5a1e MIPS: dspr2: added optimization for function ClampedAddSubtractHalf
+829a8c1 MIPS: dspr2: added optimization for ITransform
+c94ed49 gif2webp: Use the default hint instead of WEBP_HINT_GRAPH.
+653ace5 Increase the MAX_COLOR_CACHE_BITS from 9 to 10.
+919220c Change the logic adjusting the Histogram bits.
+53b096c Merge "Fix bug in VP8LCalculateEstimateForCacheSize."
+e912bd5 Fix bug in VP8LCalculateEstimateForCacheSize.
+541d783 Merge "dec_neon: add RD4 intra predictor"
+f8cd067 Merge "Makefile.vc: add a 'legacy' RTLIBCFG option"
+22881c9 dec_neon: add RD4 intra predictor
+613d281 update NEWS
+1304eb3 Merge "dec_neon: DC4: use pair-wise adds for top row"
+34c20c0 Makefile.vc: add a 'legacy' RTLIBCFG option
+7083006 Merge "dsp/dec_{neon,sse2}: VE4: normalize variable names"
+0db9031 dsp/dec_{neon,sse2}: VE4: normalize variable names
+b5bc153 dec_neon: DC4: use pair-wise adds for top row
+5b90d8f Unify the API between VP8BitWriter and VP8LBitWriter
+f7ada56 Merge changes I2e06907b,Ia9ed4ca6,I782282ff
+5beb6bf Merge "dec_neon: add VE4 intra predictor"
+eba6ce0 dec_neon: add DC4 intra predictor
+79abfbd dec_neon: add TM4 intra predictor
+fe395f0 dec_neon: add LD4 intra predictor
+32de385 dec_neon: add VE4 intra predictor
+72395ba Merge "Modify CostModel to allocate optimal memory."
+65e5eb8 gif2webp: Support GIF_DISPOSE_RESTORE_PREVIOUS
+e4c829e gif2webp: Handle frames with odd offsets + disposal to background.
+c2b5a03 Modify CostModel to allocate optimal memory.
+b7a33d7 implement VE4/HE4/RD4/... in SSE2
+97c76f1 make VP8PredLuma4[] non-const and initialize array in VP8DspInit()
+0ea8c6c Merge "PrintReg: output to stderr"
+d7ff2f9 Merge "stopwatch.h: fix includes"
+f85ec71 PrintReg: output to stderr
+54edbf6 stopwatch.h: fix includes
+139142e Optimize BackwardReferenceHashChainFollowPath.
+5f36b68 enc/backward_references.c: fix indent
+e0e9960 Merge "sync version numbers to 0.4.2 release"
+64ac514 sync version numbers to 0.4.2 release
+c24f895 Simplify and speedup Backward refs computation.
+d1c359e fix shared object build with -fvisibility=hidden
+a4c3a31 WEBP_TSAN_IGNORE_FUNCTION: fix gcc compat warning
+f358eeb add code for testing random incremental decoding in dwebp
+8024729 mark some init function as being safe for thread_sanitizer.
+79b5bdb bit_reader.h: cosmetics: fix a typo
+6c67368 Improved near-lossless mode.
+0ce27e7 enc_mips32: workaround gcc-4.9 bug
+aca1b98 enc/vp8l.c: fix indent
+ca00502 Evaluate non-palette compression for palette image
+c8a87bb AssignSegments: quiet -Warray-bounds warning
+32f67e3 Merge "enc_neon: initialize vectors w/vdup_n_u32"
+fabc65d 1-3% faster encoding optimizing SSE_NxN functions
+7534d71 enc_neon: initialize vectors w/vdup_n_u32
+5f81391 Merge "Fix return code of EncodeImageInternal()"
+e321abe Fix return code of EncodeImageInternal()
+f82cb06 optimize palette ordering
+f545fee don't set the alpha value for histogram index image
+2d9b0a4 add WebPDispatchAlphaToGreen() to dsp
+1bd4c2a Merge "Change Entropy based Histogram Combine heuristic."
+e295b8f Merge "iosbuild: cleanup"
+1be4e76 Merge "iosbuild: output autoconf req. on failure"
+d5e498d Change Entropy based Histogram Combine heuristic.
+47a2d8e fix MSVC float->int conversion warning
+041956f iosbuild: cleanup
+767eb40 iosbuild: output autoconf req. on failure
+35ad48b HistoHeapInit: correct positions allocation size
+45d9635 lossless: entropy clustering for high qualities.
+dc37df8 fix type warning for VS9_x64
+9f7d9e6 iosbuild: make iOS 6 the minimum requirement
+fdd6528 Remove unused VP8LDecoder member variable
+ea3bba5 Merge "rewrite Disto4x4 in enc_neon.c with intrinsic"
+f060dfc add lossless incremental decoding support
+ab70794 rewrite Disto4x4 in enc_neon.c with intrinsic
+d447163 MIPS: dspr2: added optimization for function FilterLoop24
+2aef54d Merge "prepare VP8LDecodeImage for incremental decode"
+aed0f5a Merge "MIPS: dspr2: added optimization for function FilterLoop26"
+2863068 prepare VP8LDecodeImage for incremental decode
+248f3ae remove br->error_ field
+49e1504 MIPS: dspr2: added optimization for function FilterLoop26
+38128cb iobuild.sh: only install .h files in Headers
+c792d41 Premultiply with alpha during U/V downsampling
+0cc811d gif2webp: Background color correction
+d7167ff Amend the lossless spec according to issue #205, #206 and #224
+b901416 Record the lossless size stats.
+cddd334 Add a WebPExtractAlpha function to dsp
+0716a98 fix indent after I0204949917836f74c0eb4ba5a7f4052a4797833b
+f9ced95 Optimize lossless decoding for trivial(ARB) codes.
+924fcfd Merge "webpmux: simplify InitializeConfig()"
+c0a462c webpmux: simplify InitializeConfig()
+6986bb5 webpmux: fix indent
+f89e169 webpmux: fix exit status on numeric value parse error
+2172cb6 Merge "webpmux: fix loop_count range check"
+e3b343e Merge "examples: warn on invalid numeric parameters"
+0e23c48 webpmux: fix loop_count range check
+6208338 Merge "fix loop bug in DispatchAlpha()"
+d51f3e4 gif2webp: Handle frames with missing  graphic control extension
+690b491 fix loop bug in DispatchAlpha()
+96d43a8 examples: warn on invalid numeric parameters
+3101f53 MIPS: dspr2: added optimization for TransformOne
+a6bb9b1 SSE2 for inverse Mult(ARGB)Row and ApplyAlphaMultiply
+d84a8ff Remove default initialization of decoder status.
+be70b86 configure: simplify libpng-config invocation
+e0a9932 Rectify bug in lossless incremental decoding.
+e2502a9 MIPS: dspr2: added optimization for TransformAC3
+24e1072 MIPS: dspr2: added optimization for TransformDC
+c0e84df Merge "Slightly faster lossless decoding (1%)"
+8dd28bb Slightly faster lossless decoding (1%)
+f010359 MIPS: dspr2: added optimization for ColorIndexInverseTransforms
+d3242ae make VP8LSetBitPos() set br->eos_ flag
+a9decb5 Lossless decoding: fix eos_ flag condition
+3fea6a2 fix erroneous dec->status_ setting
+80b8099 MIPS: dspr2: add some specific mips code to commit I2c3f2b12f8df15b785fad5a9c56316e954ae0c53
+e564062 Merge "further refine the COPY_PATTERN optim for DecodeAlpha"
+854509f enc/histogram.c: reindent after f4059d0
+3442196 Merge "~3-5% faster encoding optimizing PickBestIntra*()"
+865069c further refine the COPY_PATTERN optim for DecodeAlpha
+a595622 added C-level optimization for DecodeAlphaData function
+187d379 add a fallback to ALPHA_NO_COMPRESSION
+a48a2d7 ~3-5% faster encoding optimizing PickBestIntra*()
+a614019 ExUtilReadFromStdin: (windows) open stdin in bin mode
+e80eab1 webpmux: (windows) open stdout in binary mode
+e9bfb11 cwebp: (windows) open stdout in binary mode
+5927e15 example_util: add ExUtilSetBinaryMode
+30f3b75 webpmux man page: Clarify some title, descriptions and examples
+77d4c7e address cosmetic comments from patch #71380
+f75dfbf Speed up Huffman decoding for lossless
+637b388 dsp/lossless: workaround gcc-4.9 bug on arm
+8323a90 dsp.h: collect gcc/clang version test macros
+e6c4b52 move static initialization of WebPYUV444Converters[] to the Init function.
+49911d4 Merge "fix indentation"
+f4059d0 Code cleanup for HistogramRemap.
+e632b09 fix indentation
+f5c04d6 Merge "add a DispatchAlpha() for SSE2 that handles 8 pixels at a time"
+fc98edd add a DispatchAlpha() for SSE2 that handles 8 pixels at a time
+73d361d introduce VP8EncQuantize2Blocks to quantize two blocks at a time
+0b21c30 MIPS: dspr2: added optimization for EmitAlphaRGB
+953acd5 enc_neon: enable QuantizeBlock for aarch64
+f4ae143 MIPS: mips32: code rebase
+5697715 MIPS: dspr2: added optimizations for VP8YuvTo*
+2523aa7 SmartRGBYUV: fix odd-width problem with pixel replication
+ee52dc4 fix some MSVC64 warning about float conversion
+3fca851 cpu: check for _MSC_VER before using msvc inline asm
+e2a83d7 faster RGB->YUV conversion function (~7% speedup)
+de2d03e Merge "Add smart RGB->YUV conversion option -pre 4"
+3fc4c53 Add smart RGB->YUV conversion option -pre 4
+b4dc406 MIPS: dspr2: added optimization for (un)filters
+137e609 Merge "configure: add work around for gcc-4.9 aarch64 bug"
+b61c9ce MIPS: dspr2: Optimization of some simple point-sampling functions
+e2b8cec configure: add work around for gcc-4.9 aarch64 bug
+98c5410 MIPS: mips32r2: added optimization for BSwap32
+dab702b Update PATENTS to reflect s/VP8/WebM/g
+b564f7c Merge "MIPS: detect mips32r6 and disable mips32r1 code"
+b7e5a5c MIPS: detect mips32r6 and disable mips32r1 code
+63c2fc0 Correctly use the AC_CANONICAL_* macros
+bb07022 Merge "cosmetics"
+e300c9d cosmetics
+0e519ee Merge "cosmetics: remove some extraneous 'extern's"
+3ef0f08 Merge "vp8enci.h: cosmetics: fix '*' placement"
+4c6dde3 bit_writer: cosmetics: rename kFlush() -> Flush()
+f7b4c48 cosmetics: remove some extraneous 'extern's
+b47fb00 vp8enci.h: cosmetics: fix '*' placement
+b5a36cc add -near_lossless [0..100] experimental option
+0524d9e dsp: detect mips64 & disable mips32 code
+d3485d9 cwebp.1: fix quality description placement
+29a9fe2 Merge tag 'v0.4.1'
 8af2771 update ChangeLog (tag: v0.4.1, origin/0.4.1, 0.4.1)
+e09e9ff Record & log the image pre-processing time.
 f59c0b4 iosbuild.sh: specify optimization flags
 8d34ea3 update ChangeLog (tag: v0.4.1-rc1)
 dbc3da6 makefile.unix: add vwebp.1 to the dist target
--- a/Makefile.vc
+++ b/Makefile.vc
@ -11,6 +11,8 @@ LIBWEBPDEMUX_BASENAME = libwebpdemux
 ARCH = x86
 !ELSE IF ! [ cl 2>&1 | find "x64" > NUL ]
 ARCH = x64
+!ELSE IF ! [ cl 2>&1 | find "ARM" > NUL ]
+ARCH = ARM
 !ELSE
 !ERROR Unable to auto-detect toolchain architecture! \
 If cl.exe is in your PATH rerun nmake with ARCH=<arch>.
@ -29,7 +31,6 @@ CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
 CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
 CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
-CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
 LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
 LNKDLL     = link.exe /DLL $(NOLOGO)
@ -37,6 +38,12 @@ LNKEXE     = link.exe $(NOLOGO)
 LNKLIB     = lib.exe $(NOLOGO)
 MT         = mt.exe $(NOLOGO)

+!IF "$(ARCH)" == "ARM"
+CFLAGS = $(CFLAGS) /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP /DWEBP_USE_THREAD
+!ELSE
+CFLAGS = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
+!ENDIF
+
 CFGSET     = FALSE
 !IF "$(OBJDIR)" == ""
 OUTDIR = ..\obj\
@ -111,9 +118,7 @@ LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME).lib
 LIBWEBPMUX = $(DIRLIB)\$(LIBWEBPMUX_BASENAME).lib
 LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPDEMUX_BASENAME).lib
 !ELSE IF "$(DLLBUILD)" == "TRUE"
-DLLC   = webp_dll.c
 DLLINC = webp_dll.h
-DLL_OBJS = $(DIROBJ)\$(DLLC:.c=.obj)
 CC     = $(CC) /I$(DIROBJ) /FI$(DLLINC) $(RTLIB) /DWEBP_DLL
 LIBWEBPDECODER = $(DIRLIB)\$(LIBWEBPDECODER_BASENAME)_dll.lib
 LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME)_dll.lib
@ -142,6 +147,8 @@ CFGSET = TRUE
 !MESSAGE .                                  features enabled.
 !MESSAGE - (empty)                        - build libwebp-based targets for CFG
 !MESSAGE - all                            - build (de)mux-based targets for CFG
+!MESSAGE - gif2webp                       - requires libgif & >= VS2013
+!MESSAGE - anim_diff                      - requires libgif & >= VS2013
 !MESSAGE
 !MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
 !MESSAGE   'legacy' will produce a Windows 2000 compatible library.
@ -174,34 +181,67 @@ DEC_OBJS = \
    $(DIROBJ)\dec\webp.obj \

 DEMUX_OBJS = \
+    $(DIROBJ)\demux\anim_decode.obj \
    $(DIROBJ)\demux\demux.obj \

 DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\alpha_processing.obj \
+    $(DIROBJ)\dsp\alpha_processing_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\alpha_processing_sse2.obj \
+    $(DIROBJ)\dsp\alpha_processing_sse41.obj \
    $(DIROBJ)\dsp\cpu.obj \
    $(DIROBJ)\dsp\dec.obj \
    $(DIROBJ)\dsp\dec_clip_tables.obj \
    $(DIROBJ)\dsp\dec_mips32.obj \
+    $(DIROBJ)\dsp\dec_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\dec_neon.obj \
    $(DIROBJ)\dsp\dec_sse2.obj \
+    $(DIROBJ)\dsp\dec_sse41.obj \
+    $(DIROBJ)\dsp\filters.obj \
+    $(DIROBJ)\dsp\filters_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\filters_sse2.obj \
    $(DIROBJ)\dsp\lossless.obj \
-    $(DIROBJ)\dsp\lossless_mips32.obj \
+    $(DIROBJ)\dsp\lossless_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\lossless_neon.obj \
    $(DIROBJ)\dsp\lossless_sse2.obj \
+    $(DIROBJ)\dsp\rescaler.obj \
+    $(DIROBJ)\dsp\rescaler_mips32.obj \
+    $(DIROBJ)\dsp\rescaler_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\rescaler_neon.obj \
+    $(DIROBJ)\dsp\rescaler_sse2.obj \
    $(DIROBJ)\dsp\upsampling.obj \
+    $(DIROBJ)\dsp\upsampling_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\upsampling_neon.obj \
    $(DIROBJ)\dsp\upsampling_sse2.obj \
    $(DIROBJ)\dsp\yuv.obj \
    $(DIROBJ)\dsp\yuv_mips32.obj \
+    $(DIROBJ)\dsp\yuv_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\yuv_sse2.obj \

 DSP_ENC_OBJS = \
+    $(DIROBJ)\dsp\argb.obj \
+    $(DIROBJ)\dsp\argb_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\argb_sse2.obj \
+    $(DIROBJ)\dsp\cost.obj \
+    $(DIROBJ)\dsp\cost_mips32.obj \
+    $(DIROBJ)\dsp\cost_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\cost_sse2.obj \
    $(DIROBJ)\dsp\enc.obj \
    $(DIROBJ)\dsp\enc_avx2.obj \
    $(DIROBJ)\dsp\enc_mips32.obj \
+    $(DIROBJ)\dsp\enc_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\enc_neon.obj \
    $(DIROBJ)\dsp\enc_sse2.obj \
+    $(DIROBJ)\dsp\enc_sse41.obj \
+    $(DIROBJ)\dsp\lossless_enc.obj \
+    $(DIROBJ)\dsp\lossless_enc_mips32.obj \
+    $(DIROBJ)\dsp\lossless_enc_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\lossless_enc_neon.obj \
+    $(DIROBJ)\dsp\lossless_enc_sse2.obj \
+    $(DIROBJ)\dsp\lossless_enc_sse41.obj \
+
+EX_ANIM_UTIL_OBJS = \
+    $(DIROBJ)\examples\anim_util.obj \

 EX_FORMAT_DEC_OBJS = \
    $(DIROBJ)\examples\jpegdec.obj \
@ -211,6 +251,9 @@ EX_FORMAT_DEC_OBJS = \
    $(DIROBJ)\examples\webpdec.obj \
    $(DIROBJ)\examples\wicdec.obj \

+EX_GIF_DEC_OBJS = \
+    $(DIROBJ)\examples\gifdec.obj \
+
 EX_UTIL_OBJS = \
    $(DIROBJ)\examples\example_util.obj \

@ -220,10 +263,12 @@ ENC_OBJS = \
    $(DIROBJ)\enc\backward_references.obj \
    $(DIROBJ)\enc\config.obj \
    $(DIROBJ)\enc\cost.obj \
+    $(DIROBJ)\enc\delta_palettization.obj \
    $(DIROBJ)\enc\filter.obj \
    $(DIROBJ)\enc\frame.obj \
    $(DIROBJ)\enc\histogram.obj \
    $(DIROBJ)\enc\iterator.obj \
+    $(DIROBJ)\enc\near_lossless.obj \
    $(DIROBJ)\enc\picture.obj \
    $(DIROBJ)\enc\picture_csp.obj \
    $(DIROBJ)\enc\picture_psnr.obj \
@ -237,6 +282,7 @@ ENC_OBJS = \
    $(DIROBJ)\enc\webpenc.obj \

 MUX_OBJS = \
+    $(DIROBJ)\mux\anim_encode.obj \
    $(DIROBJ)\mux\muxedit.obj \
    $(DIROBJ)\mux\muxinternal.obj \
    $(DIROBJ)\mux\muxread.obj \
@ -264,19 +310,34 @@ LIBWEBPMUX_OBJS = $(MUX_OBJS) $(LIBWEBPMUX_OBJS)
 LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS) $(LIBWEBPDEMUX_OBJS)

 OUT_LIBS = $(LIBWEBPDECODER) $(LIBWEBP)
+!IF "$(ARCH)" == "ARM"
+ex: $(OUT_LIBS)
+all: ex
+!ELSE
 OUT_EXAMPLES = $(DIRBIN)\cwebp.exe $(DIRBIN)\dwebp.exe
 EXTRA_EXAMPLES = $(DIRBIN)\vwebp.exe $(DIRBIN)\webpmux.exe

 ex: $(OUT_LIBS) $(OUT_EXAMPLES)
 all: ex $(EXTRA_EXAMPLES)
+# NB: gif2webp.exe and anim_diff.exe are excluded from 'all' as libgif requires
+# C99 support which is only available from VS2013 onward.
+gif2webp: $(DIRBIN)\gif2webp.exe
+anim_diff: $(DIRBIN)\anim_diff.exe
+
+$(DIRBIN)\anim_diff.exe: $(DIROBJ)\examples\anim_diff.obj $(EX_ANIM_UTIL_OBJS)
+$(DIRBIN)\anim_diff.exe: $(EX_UTIL_OBJS)
+$(DIRBIN)\anim_diff.exe: $(EX_GIF_DEC_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj $(EX_FORMAT_DEC_OBJS)
 $(DIRBIN)\dwebp.exe: $(DIROBJ)\examples\dwebp.obj
+$(DIRBIN)\gif2webp.exe: $(DIROBJ)\examples\gif2webp.obj $(EX_GIF_DEC_OBJS)
+$(DIRBIN)\gif2webp.exe: $(EX_UTIL_OBJS) $(LIBWEBPMUX) $(LIBWEBP)
 $(DIRBIN)\vwebp.exe: $(DIROBJ)\examples\vwebp.obj
 $(DIRBIN)\vwebp.exe: $(EX_UTIL_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\webpmux.exe: $(DIROBJ)\examples\webpmux.obj $(LIBWEBPMUX)
 $(DIRBIN)\webpmux.exe: $(EX_UTIL_OBJS) $(LIBWEBP)
 $(OUT_EXAMPLES): $(EX_UTIL_OBJS) $(LIBWEBP)
 $(EX_UTIL_OBJS) $(EX_FORMAT_DEC_OBJS): $(OUTPUT_DIRS)
+!ENDIF  # ARCH == ARM

 experimental:
 	$(MAKE) /f Makefile.vc \
@ -292,7 +353,7 @@ $(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): $(OUTPUT_DIRS)

 !IF "$(DLLBUILD)" == "TRUE"
 $(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): \
-    $(DIROBJ)\$(DLLINC) $(DIROBJ)\$(DLLC)
+    $(DIROBJ)\$(DLLINC)

 {$(DIROBJ)}.c{$(DIROBJ)}.obj:
 	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$@  $<
@ -305,7 +366,7 @@ $(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
 	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y

 clean::
-	@-erase /s $(DIROBJ)\$(DLLC) $(DIROBJ)\$(DLLINC) 2> NUL
+	@-erase /s $(DIROBJ)\$(DLLINC) 2> NUL
 !ELSE
 $(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
 	$(LNKLIB) /out:$@ $**
@ -322,22 +383,24 @@ $(DIROBJ)\$(DLLINC):
 	@echo #define WEBP_EXTERN(type) __declspec(dllexport) type >> $@
 	@echo #endif  /* WEBP_DLL_H_ */ >> $@

-# expose a WebPFree() function for use in managed code
-$(DIROBJ)\$(DLLC): $(DIROBJ)\$(DLLINC)
-	@echo #include ^<stdlib.h^> > $@
-	@echo #include "webp_dll.h" >> $@
-	@echo // This function should be used in place of free() for memory >> $@
-	@echo // returned by the WebP API. >> $@
-	@echo WEBP_EXTERN(void) WebPFree(void* ptr) { >> $@
-	@echo   free(ptr); >> $@
-	@echo } >> $@
-
 .SUFFIXES: .c .obj .res .exe
 # File-specific flag builds. Note batch rules take precedence over wildcards,
 # so for now name each file individually.
 $(DIROBJ)\dsp\enc_avx2.obj: src\dsp\enc_avx2.c
 	$(CC) $(CFLAGS) $(AVX2_FLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dsp\ \
 	  src\dsp\$(@B).c
+$(DIROBJ)\examples\anim_diff.obj: examples\anim_diff.c
+	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
+	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
+$(DIROBJ)\examples\anim_util.obj: examples\anim_util.c
+	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
+	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
+$(DIROBJ)\examples\gif2webp.obj: examples\gif2webp.c
+	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
+	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
+$(DIROBJ)\examples\gifdec.obj: examples\gifdec.c
+	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
+	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
 # Batch rules
 {examples}.c{$(DIROBJ)\examples}.obj::
 	$(CC) $(CFLAGS) /Fd$(DIROBJ)\examples\ /Fo$(DIROBJ)\examples\ $<
--- a/33
+++ b/33
@ -1,3 +1,33 @@
+- 12/17/2015: version 0.5.0
+  * miscellaneous bug & build fixes (issues #234, #258, #274, #275, #278)
+  * encoder & decoder speed-ups on x86/ARM/MIPS for lossy & lossless
+    - note! YUV->RGB conversion was sped-up, but the results will be slightly
+      different from previous releases
+  * various lossless encoder improvements
+  * gif2webp improvements, -min_size option added
+  * tools fully support input from stdin and output to stdout (issue #168)
+  * New WebPAnimEncoder API for creating animations
+  * New WebPAnimDecoder API for decoding animations
+  * other API changes:
+    - libwebp:
+      WebPPictureSmartARGBToYUVA() (-pre 4 in cwebp)
+      WebPConfig::exact (-exact in cwebp; -alpha_cleanup is now the default)
+      WebPConfig::near_lossless (-near_lossless in cwebp)
+      WebPFree() (free'ing webp allocated memory in other languages)
+      WebPConfigLosslessPreset()
+      WebPMemoryWriterClear()
+    - libwebpdemux: removed experimental fragment related fields and functions
+    - libwebpmux: WebPMuxSetCanvasSize()
+  * new libwebpextras library with some uncommon import functions:
+    WebPImportGray/WebPImportRGB565/WebPImportRGB4444
+
+- 10/15/15: version 0.4.4
+  This is a binary compatible release.
+  * rescaling out-of-bounds read fix (issue #254)
+  * various build fixes and improvements (issues #253, #259, #262, #267, #268)
+  * container documentation update
+  * gif2webp transparency fix (issue #245)
+
 - 3/3/15: version 0.4.3
  This is a binary compatible release.
  * Android / gcc / iOS / MSVS build fixes and improvements
@ -89,7 +119,8 @@
 - 9/19/11: version 0.1.3
  * Advanced decoding APIs.
  * On-the-fly cropping and rescaling of images.
-  * SSE2 instructions for decoding performance optimizations on x86 based platforms.
+  * SSE2 instructions for decoding performance optimizations on x86 based
+    platforms.
  * Support Multi-threaded decoding.
  * 40% improvement in Decoding performance.
  * Add support for RGB565, RGBA4444 & ARGB image colorspace.
--- a/2
+++ b/2
@ -17,7 +17,7 @@ or agree to the institution of patent litigation or any other patent
 enforcement activity against any entity (including a cross-claim or
 counterclaim in a lawsuit) alleging that any of these implementations of WebM
 or any code incorporated within any of these implementations of WebM
-constitutes direct or contributory patent infringement, or inducement of
+constitute direct or contributory patent infringement, or inducement of
 patent infringement, then any patent rights granted to you under this License
 for these implementations of WebM shall terminate as of the date such
 litigation is filed.
--- a/52
+++ b/52
@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.4.3
+               \____/____/\_____/_____/____/v0.5.0

 Description:
 ============
@ -15,7 +15,8 @@ as well as the command line tools 'cwebp' and 'dwebp'.

 See http://developers.google.com/speed/webp

-Latest sources are available from http://www.webmproject.org/code/
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp

 It is released under the same license as the WebM project.
 See http://www.webmproject.org/license/software/ or the
@ -53,6 +54,12 @@ Please refer to makefile.unix for additional details and customizations.

 Using autoconf tools:
 ---------------------
+Prerequisites:
+A compiler (e.g., gcc), make, autoconf, automake, libtool.
+On a Debian-like system the following should install everything you need for a
+minimal build:
+$ sudo apt-get install gcc make autoconf automake libtool
+
 When building from git sources, you will need to run autogen.sh to generate the
 configure script.

@ -152,6 +159,8 @@ Options:
                            default, photo, picture,
                            drawing, icon, text
     -preset must come first, as it overwrites other parameters
+  -z <int> ............... activates lossless preset with given
+                           level in [0:fast, ..., 9:slowest]

  -m <int> ............... compression method (0=fast, 6=slowest)
  -segments <int> ........ number of segments to use (1..4)
@ -179,13 +188,15 @@ Options:
  -alpha_method <int> .... transparency-compression method (0..1)
  -alpha_filter <string> . predictive filtering for alpha plane,
                           one of: none, fast (default) or best
-  -alpha_cleanup ......... clean RGB values in transparent area
+  -exact ................. preserve RGB values in transparent area
  -blend_alpha <hex> ..... blend colors against background color
                           expressed as RGB values written in
                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0
                           green=0xe0 and blue=0xd0
  -noalpha ............... discard any transparency information
  -lossless .............. encode image losslessly
+  -near_lossless <int> ... use near-lossless image
+                           preprocessing (0..100=off)
  -hint <string> ......... specify image characteristics hint,
                           one of: photo, picture or graph

@ -268,13 +279,16 @@ Use following options to convert into alternate image formats:
  -nofilter .... disable in-loop filtering
  -nodither .... disable dithering
  -dither <d> .. dithering strength (in 0..100)
+  -alpha_dither  use alpha-plane dithering if needed
  -mt .......... use multi-threading
  -crop <x> <y> <w> <h> ... crop output with the given rectangle
-  -scale <w> <h> .......... scale the output (*after* any cropping)
+  -resize <w> <h> ......... scale the output (*after* any cropping)
+  -flip ........ flip the output vertically
  -alpha ....... only save the alpha plane
  -incremental . use incremental decoding (useful for tests)
  -h     ....... this help message
  -v     ....... verbose (e.g. print encoding/decoding times)
+  -quiet ....... quiet mode, don't print anything
  -noasm ....... disable all assembly optimizations

 Visualization tool:
@ -294,6 +308,7 @@ Options are:
  -nofancy ..... don't use the fancy YUV420 upscaler
  -nofilter .... disable in-loop filtering
  -dither <int>  dithering strength (0..100), default=50
+  -noalphadither disable alpha plane dithering
  -mt .......... use multi-threading
  -info ........ print info
  -h     ....... this help message
@ -344,6 +359,10 @@ Options:
                           or lossless compression heuristically
  -q <float> ............. quality factor (0:small..100:big)
  -m <int> ............... compression method (0=fast, 6=slowest)
+  -min_size .............. minimize output size (default:off)
+                           lossless compression by default; can be
+                           combined with -q, -m, -lossy or -mixed
+                           options
  -kmin <int> ............ min distance between key frames
  -kmax <int> ............ max distance between key frames
  -f <int> ............... filter strength (0=off..100)
@ -366,6 +385,29 @@ or using autoconf:
 $ ./configure --enable-everything
 $ make

+Comparison of animated images:
+==============================
+Test utility anim_diff under examples/ can be used to compare two animated
+images (each can be GIF or WebP).
+
+Usage: anim_diff <image1> <image2> [options]
+
+Options:
+  -dump_frames <folder> dump decoded frames in PAM format
+  -min_psnr <float> ... minimum per-frame PSNR
+  -raw_comparison ..... if this flag is not used, RGB is
+                        premultiplied before comparison
+
+Building:
+---------
+With the libgif development files and a C++ compiler installed, anim_diff can
+be built using makefile.unix:
+$ make -f makefile.unix examples/anim_diff
+
+or using autoconf:
+$ ./configure --enable-everything
+$ make
+
 Encoding API:
 =============

@ -596,7 +638,7 @@ Bugs:
 =====

 Please report all bugs to our issue tracker:
-    http://code.google.com/p/webp/issues
+    https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
    http://www.webmproject.org/code/contribute/submitting-patches/

--- a/README.mux
+++ b/README.mux
@ -1,7 +1,7 @@
          __   __  ____  ____  ____  __ __  _     __ __
         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
         \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.2.2
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.3.0


 Description:
@ -133,7 +133,7 @@ WebP files. This API currently supports reading of XMP/EXIF metadata, ICC
 profile and animated images. Other features may be added in subsequent
 releases.

-Code Example: Demuxing WebP data to extract all the frames, ICC profile
+Code example: Demuxing WebP data to extract all the frames, ICC profile
 and EXIF/XMP metadata.

  WebPDemuxer* demux = WebPDemux(&webp_data);
@ -170,12 +170,36 @@ and EXIF/XMP metadata.
 For a detailed Demux API reference, please refer to the header file
 (src/webp/demux.h).

+AnimEncoder API:
+================
+The AnimEncoder API can be used to create animated WebP images.
+
+Code example:
+
+  WebPAnimEncoderOptions enc_options;
+  WebPAnimEncoderOptionsInit(&enc_options);
+  // ... (Tune 'enc_options' as needed).
+  WebPAnimEncoder* enc = WebPAnimEncoderNew(width, height, &enc_options);
+  while(<there are more frames>) {
+    WebPConfig config;
+    WebPConfigInit(&config);
+    // ... (Tune 'config' as needed).
+    WebPAnimEncoderAdd(enc, frame, duration, &config);
+  }
+  WebPAnimEncoderAssemble(enc, webp_data);
+  WebPAnimEncoderDelete(enc);
+  // ... (Write the 'webp_data' to a file, or re-mux it further).
+
+
+For a detailed AnimEncoder API reference, please refer to the header file
+(src/webp/mux.h).
+

 Bugs:
 =====

 Please report all bugs to our issue tracker:
-    http://code.google.com/p/webp/issues
+    https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
    http://www.webmproject.org/code/contribute/submitting-patches/

--- a/configure.ac
+++ b/configure.ac
@ -1,5 +1,5 @@
-AC_INIT([libwebp], [0.4.3],
-        [http://code.google.com/p/webp/issues],,
+AC_INIT([libwebp], [0.5.0],
+        [https://bugs.chromium.org/p/webp],,
        [http://developers.google.com/speed/webp])
 AC_CANONICAL_HOST
 AC_PREREQ([2.60])
@ -28,8 +28,21 @@ AC_ARG_ENABLE([everything],
                              disabled with --disable-target]),
              [SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
               SET_IF_UNSET([enable_libwebpdemux], [$enableval])
+               SET_IF_UNSET([enable_libwebpextras], [$enableval])
               SET_IF_UNSET([enable_libwebpmux], [$enableval])])

+dnl === If --enable-asserts is not defined, define NDEBUG
+
+AC_MSG_CHECKING(whether asserts are enabled)
+AC_ARG_ENABLE([asserts],
+              AS_HELP_STRING([--enable-asserts],
+                             [Enable assert checks]))
+if test "x${enable_asserts-no}" = "xno"; then
+  AM_CPPFLAGS="${AM_CPPFLAGS} -DNDEBUG"
+fi
+AC_MSG_RESULT(${enable_asserts-no})
+AC_SUBST([AM_CPPFLAGS])
+
 AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=DIR],
            [Path to the pkgconfig directory @<:@LIBDIR/pkgconfig@:>@]),
            [pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
@ -51,6 +64,7 @@ AC_DEFUN([TEST_AND_ADD_CFLAGS],
                            [$1="${$1} $2"],
                            [AC_MSG_RESULT([no])])
          CFLAGS="$SAVED_CFLAGS"])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-fvisibility=hidden])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
@ -60,6 +74,8 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-prototypes])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wold-style-definition])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshorten-64-to-32])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunreachable-code])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused-but-set-variable])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wvla])
@ -76,36 +92,91 @@ AS_IF([test "$GCC" = "yes" ], [
       esac
       AS_IF([test "$gcc_wht_bug" = "yes"], [
              TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-frename-registers])])])
+# Use -flax-vector-conversions, if available, when building intrinsics with
+# older versions of gcc. The flag appeared in 4.3.x, but if backported, and
+# -fno-lax-vector-conversions is set, errors may occur with the intrinsics
+# files along with the older system includes, e.g., emmintrin.h.
+# Originally observed with cc (GCC) 4.2.1 20070831 patched [FreeBSD] (9.3).
+# https://bugs.chromium.org/p/webp/issues/detail?id=274
+AS_IF([test "$GCC" = "yes" ], [
+       case "$host_cpu" in
+         amd64|i?86|x86_64)
+           AC_COMPILE_IFELSE(
+             dnl only check for -flax-vector-conversions with older gcc, skip
+             dnl clang as it reports itself as 4.2.1, but the flag isn't needed.
+             [AC_LANG_SOURCE([#if !defined(__clang__) && defined(__GNUC__) && \
+                                  ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x403
+                              #error old gcc
+                              #endif
+                              int main(void) { return 0; }
+                             ])],,
+              [TEST_AND_ADD_CFLAGS([INTRINSICS_CFLAGS],
+                                   [-flax-vector-conversions])])
+           ;;
+       esac])
 AC_SUBST([AM_CFLAGS])

 dnl === Check for machine specific flags
-TEST_AND_ADD_CFLAGS([AVX2_FLAGS], [-mavx2])
-AS_IF([test -n "$AVX2_FLAGS"], [
-  SAVED_CFLAGS=$CFLAGS
-  CFLAGS="$CFLAGS $AVX2_FLAGS"
-  AC_CHECK_HEADER([immintrin.h],
-                  [AC_DEFINE(WEBP_HAVE_AVX2, [1],
-                   [Set to 1 if AVX2 is supported])],
-                  [AVX2_FLAGS=""],
-                  dnl it's illegal to directly include avx2intrin.h, but it's
-                  dnl included conditionally in immintrin.h, tricky!
-                  [#ifndef __AVX2__
-                   #error avx2 is not enabled
-                   #endif
-                  ])
-  CFLAGS=$SAVED_CFLAGS])
-AC_SUBST([AVX2_FLAGS])
+AC_ARG_ENABLE([avx2],
+              AS_HELP_STRING([--disable-avx2],
+                             [Disable detection of AVX2 support
+                              @<:@default=auto@:>@]))

-TEST_AND_ADD_CFLAGS([SSE2_FLAGS], [-msse2])
-AS_IF([test -n "$SSE2_FLAGS"], [
-  SAVED_CFLAGS=$CFLAGS
-  CFLAGS="$CFLAGS $SSE2_FLAGS"
-  AC_CHECK_HEADER([emmintrin.h],
-                  [AC_DEFINE(WEBP_HAVE_SSE2, [1],
-                   [Set to 1 if SSE2 is supported])],
-                  [SSE2_FLAGS=""])
-  CFLAGS=$SAVED_CFLAGS])
-AC_SUBST([SSE2_FLAGS])
+AS_IF([test "x$enable_avx2" != "xno" -a "x$enable_sse4_1" != "xno" \
+         -a "x$enable_sse2" != "xno"], [
+  AVX2_CFLAGS="$INTRINSICS_CFLAGS $AVX2_FLAGS"
+  TEST_AND_ADD_CFLAGS([AVX2_FLAGS], [-mavx2])
+  AS_IF([test -n "$AVX2_FLAGS"], [
+    SAVED_CFLAGS=$CFLAGS
+    CFLAGS="$CFLAGS $AVX2_FLAGS"
+    AC_CHECK_HEADER([immintrin.h],
+                    [AC_DEFINE(WEBP_HAVE_AVX2, [1],
+                     [Set to 1 if AVX2 is supported])],
+                    [AVX2_FLAGS=""],
+                    dnl it's illegal to directly include avx2intrin.h, but it's
+                    dnl included conditionally in immintrin.h, tricky!
+                    [#ifndef __AVX2__
+                     #error avx2 is not enabled
+                     #endif
+                    ])
+    CFLAGS=$SAVED_CFLAGS])
+  AC_SUBST([AVX2_FLAGS])])
+
+AC_ARG_ENABLE([sse4.1],
+              AS_HELP_STRING([--disable-sse4.1],
+                             [Disable detection of SSE4.1 support
+                              @<:@default=auto@:>@]))
+
+AS_IF([test "x$enable_sse4_1" != "xno" -a "x$enable_sse2" != "xno"], [
+  SSE41_FLAGS="$INTRINSICS_CFLAGS $SSE41_FLAGS"
+  TEST_AND_ADD_CFLAGS([SSE41_FLAGS], [-msse4.1])
+  AS_IF([test -n "$SSE41_FLAGS"], [
+    SAVED_CFLAGS=$CFLAGS
+    CFLAGS="$CFLAGS $SSE41_FLAGS"
+    AC_CHECK_HEADER([smmintrin.h],
+                    [AC_DEFINE(WEBP_HAVE_SSE41, [1],
+                     [Set to 1 if SSE4.1 is supported])],
+                    [SSE41_FLAGS=""])
+    CFLAGS=$SAVED_CFLAGS])
+  AC_SUBST([SSE41_FLAGS])])
+
+AC_ARG_ENABLE([sse2],
+              AS_HELP_STRING([--disable-sse2],
+                             [Disable detection of SSE2 support
+                              @<:@default=auto@:>@]))
+
+AS_IF([test "x$enable_sse2" != "xno"], [
+  SSE2_FLAGS="$INTRINSICS_CFLAGS $SSE2_FLAGS"
+  TEST_AND_ADD_CFLAGS([SSE2_FLAGS], [-msse2])
+  AS_IF([test -n "$SSE2_FLAGS"], [
+    SAVED_CFLAGS=$CFLAGS
+    CFLAGS="$CFLAGS $SSE2_FLAGS"
+    AC_CHECK_HEADER([emmintrin.h],
+                    [AC_DEFINE(WEBP_HAVE_SSE2, [1],
+                     [Set to 1 if SSE2 is supported])],
+                    [SSE2_FLAGS=""])
+    CFLAGS=$SAVED_CFLAGS])
+  AC_SUBST([SSE2_FLAGS])])

 dnl === CLEAR_LIBVARS([var_pfx])
 dnl ===   Clears <var_pfx>_{INCLUDES,LIBS}.
@ -405,11 +476,17 @@ AS_IF([test "x$enable_gif" != "xno"], [
  )
  LIBCHECK_EPILOGUE([GIF])

+  if test "$gif_support" = "yes" -a \
+          "$enable_libwebpdemux" = "yes"; then
+    build_animdiff=yes
+  fi
+
  if test "$gif_support" = "yes" -a \
          "$enable_libwebpmux" = "yes"; then
    build_gif2webp=yes
  fi
 ])
+AM_CONDITIONAL([BUILD_ANIMDIFF], [test "${build_animdiff}" = "yes"])
 AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])

 dnl === check for WIC support ===
@ -527,6 +604,14 @@ AC_ARG_ENABLE([libwebpdecoder],
 AC_MSG_RESULT(${enable_libwebpdecoder-no})
 AM_CONDITIONAL([BUILD_LIBWEBPDECODER], [test "$enable_libwebpdecoder" = "yes"])

+dnl === Check whether libwebpextras should be built
+AC_MSG_CHECKING(whether libwebpextras is to be built)
+AC_ARG_ENABLE([libwebpextras],
+              AS_HELP_STRING([--enable-libwebpextras],
+                             [Build libwebpextras @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpextras-no})
+AM_CONDITIONAL([WANT_EXTRAS], [test "$enable_libwebpextras" = "yes"])
+
 dnl =========================

 AC_CONFIG_MACRO_DIR([m4])
@ -535,9 +620,10 @@ AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
                 examples/Makefile src/dec/Makefile \
                 src/enc/Makefile src/dsp/Makefile \
                 src/demux/Makefile src/mux/Makefile \
-                 src/utils/Makefile \
+                 src/utils/Makefile src/extras/Makefile \
                 src/libwebp.pc src/libwebpdecoder.pc \
-                 src/demux/libwebpdemux.pc src/mux/libwebpmux.pc])
+                 src/demux/libwebpdemux.pc src/mux/libwebpmux.pc \
+                 src/extras/libwebpextras.pc])


 AC_OUTPUT
@ -553,6 +639,7 @@ libwebp: yes
 libwebpdecoder: ${enable_libwebpdecoder-no}
 libwebpdemux: ${enable_libwebpdemux-no}
 libwebpmux: ${enable_libwebpmux-no}
+libwebpextras: ${enable_libwebpextras-no}

 Tools:
 cwebp : yes
@ -568,6 +655,7 @@ dwebp : yes
  PNG  : ${png_support-no}
  WIC  : ${wic_support-no}
 GIF support : ${gif_support-no}
+anim_diff   : ${build_animdiff-no}
 gif2webp    : ${build_gif2webp-no}
 webpmux     : ${enable_libwebpmux-no}
 vwebp       : ${build_vwebp-no}
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -271,9 +271,15 @@ An extended format file consists of:

  * An optional list of [unknown chunks](#unknown-chunks). _\[status: experimental\]_

-For a _still image_, the _image data_ consists of a single frame, whereas for
-an _animated image_, it consists of multiple frames. More details about frames
-can be found in the [Animation](#animation) section.
+For a _still image_, the _image data_ consists of a single frame, which is made
+up of:
+
+  * An optional [alpha subchunk](#alpha).
+
+  * A [bitstream subchunk](#bitstream-vp8vp8l).
+
+For an _animated image_, the _image data_ consists of multiple frames. More
+details about frames can be found in the [Animation](#animation) section.

 All chunks SHOULD be placed in the same order as listed above. If a chunk
 appears in the wrong place, the file is invalid, but readers MAY parse the
@ -809,7 +815,7 @@ RIFF/WEBP
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 [vp8spec]:  http://tools.ietf.org/html/rfc6386
-[webpllspec]: https://gerrit.chromium.org/gerrit/gitweb?p=webm/libwebp.git;a=blob;f=doc/webp-lossless-bitstream-spec.txt;hb=master
+[webpllspec]: https://chromium.googlesource.com/webm/libwebp/+/master/doc/webp-lossless-bitstream-spec.txt
 [iccspec]: http://www.color.org/icc_specs2.xalter
 [metadata]: http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
 [rfc 1166]: http://tools.ietf.org/html/rfc1166
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@ -1,4 +1,4 @@
-AM_CPPFLAGS = -I$(top_builddir)/src -I$(top_srcdir)/src
+AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src

 bin_PROGRAMS = dwebp cwebp
 if BUILD_VWEBP
@ -16,6 +16,16 @@ noinst_LTLIBRARIES = libexampleutil.la

 libexampleutil_la_SOURCES = example_util.c example_util.h stopwatch.h

+if BUILD_ANIMDIFF
+  noinst_PROGRAMS = anim_diff
+endif
+
+anim_diff_SOURCES = anim_diff.c anim_util.c anim_util.h
+anim_diff_CPPFLAGS = $(AM_CPPFLAGS) $(GIF_INCLUDES)
+anim_diff_LDADD  = ../src/demux/libwebpdemux.la
+anim_diff_LDADD += libexampleutil.la
+anim_diff_LDADD += $(GIF_LIBS) -lm
+
 dwebp_SOURCES = dwebp.c stopwatch.h
 dwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 dwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES)
@ -32,7 +42,7 @@ cwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
 cwebp_LDADD  = libexampleutil.la ../src/libwebp.la
 cwebp_LDADD += $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)

-gif2webp_SOURCES = gif2webp.c gif2webp_util.c
+gif2webp_SOURCES = gif2webp.c gifdec.c gifdec.h
 gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
 gif2webp_LDADD  = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
 gif2webp_LDADD += $(GIF_LIBS)
@ -46,9 +56,11 @@ vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GL_INCLUDES)
 vwebp_LDADD = libexampleutil.la ../src/demux/libwebpdemux.la $(GL_LIBS)

 if BUILD_LIBWEBPDECODER
+  anim_diff_LDADD += ../src/libwebpdecoder.la
  dwebp_LDADD += ../src/libwebpdecoder.la
  vwebp_LDADD += ../src/libwebpdecoder.la
 else
+  anim_diff_LDADD += ../src/libwebp.la
  dwebp_LDADD += ../src/libwebp.la
  vwebp_LDADD += ../src/libwebp.la
 endif
--- a/examples/anim_diff.c
+++ b/examples/anim_diff.c
@ -0,0 +1,220 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Checks if given pair of animated GIF/WebP images are identical:
+// That is: their reconstructed canvases match pixel-by-pixel and their other
+// animation properties (loop count etc) also match.
+//
+// example: anim_diff foo.gif bar.webp
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>  // for 'strtod'.
+#include <string.h>  // for 'strcmp'.
+
+#include "./anim_util.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+// Returns true if 'a + b' will overflow.
+static int AdditionWillOverflow(int a, int b) {
+  return (b > 0) && (a > INT_MAX - b);
+}
+
+// Minimize number of frames by combining successive frames that have exact same
+// ARGB data into a single longer duration frame.
+static void MinimizeAnimationFrames(AnimatedImage* const img) {
+  uint32_t i;
+  for (i = 1; i < img->num_frames; ++i) {
+    DecodedFrame* const frame1 = &img->frames[i - 1];
+    DecodedFrame* const frame2 = &img->frames[i];
+    const uint8_t* const rgba1 = frame1->rgba;
+    const uint8_t* const rgba2 = frame2->rgba;
+    // If merging frames will result in integer overflow for 'duration',
+    // skip merging.
+    if (AdditionWillOverflow(frame1->duration, frame2->duration)) continue;
+    if (!memcmp(rgba1, rgba2, img->canvas_width * 4 * img->canvas_height)) {
+      // Merge 'i+1'th frame into 'i'th frame.
+      frame1->duration += frame2->duration;
+      if (i + 1 < img->num_frames) {
+        memmove(&img->frames[i], &img->frames[i + 1],
+                (img->num_frames - i - 1) * sizeof(*img->frames));
+      }
+      --img->num_frames;
+      --i;
+    }
+  }
+}
+
+static int CompareValues(uint32_t a, uint32_t b, const char* output_str) {
+  if (a != b) {
+    fprintf(stderr, "%s: %d vs %d\n", output_str, a, b);
+    return 0;
+  }
+  return 1;
+}
+
+// Note: As long as frame durations and reconstructed frames are identical, it
+// is OK for other aspects like offsets, dispose/blend method to vary.
+static int CompareAnimatedImagePair(const AnimatedImage* const img1,
+                                    const AnimatedImage* const img2,
+                                    int premultiply,
+                                    double min_psnr) {
+  int ok = 1;
+  const int is_multi_frame_image = (img1->num_frames > 1);
+  uint32_t i;
+
+  ok = CompareValues(img1->canvas_width, img2->canvas_width,
+                     "Canvas width mismatch") && ok;
+  ok = CompareValues(img1->canvas_height, img2->canvas_height,
+                     "Canvas height mismatch") && ok;
+  ok = CompareValues(img1->num_frames, img2->num_frames,
+                     "Frame count mismatch") && ok;
+  if (!ok) return 0;  // These are fatal failures, can't proceed.
+
+  if (is_multi_frame_image) {  // Checks relevant for multi-frame images only.
+    ok = CompareValues(img1->loop_count, img2->loop_count,
+                       "Loop count mismatch") && ok;
+    ok = CompareValues(img1->bgcolor, img2->bgcolor,
+                       "Background color mismatch") && ok;
+  }
+
+  for (i = 0; i < img1->num_frames; ++i) {
+    // Pixel-by-pixel comparison.
+    const uint8_t* const rgba1 = img1->frames[i].rgba;
+    const uint8_t* const rgba2 = img2->frames[i].rgba;
+    int max_diff;
+    double psnr;
+    if (is_multi_frame_image) {  // Check relevant for multi-frame images only.
+      const char format[] = "Frame #%d, duration mismatch";
+      char tmp[sizeof(format) + 8];
+      ok = ok && (snprintf(tmp, sizeof(tmp), format, i) >= 0);
+      ok = ok && CompareValues(img1->frames[i].duration,
+                               img2->frames[i].duration, tmp);
+    }
+    GetDiffAndPSNR(rgba1, rgba2, img1->canvas_width, img1->canvas_height,
+                   premultiply, &max_diff, &psnr);
+    if (min_psnr > 0.) {
+      if (psnr < min_psnr) {
+        fprintf(stderr, "Frame #%d, psnr = %.2lf (min_psnr = %f)\n", i,
+                psnr, min_psnr);
+        ok = 0;
+      }
+    } else {
+      if (max_diff != 0) {
+        fprintf(stderr, "Frame #%d, max pixel diff: %d\n", i, max_diff);
+        ok = 0;
+      }
+    }
+  }
+  return ok;
+}
+
+static void Help(void) {
+  printf("Usage: anim_diff <image1> <image2> [options]\n");
+  printf("\nOptions:\n");
+  printf("  -dump_frames <folder> dump decoded frames in PAM format\n");
+  printf("  -min_psnr <float> ... minimum per-frame PSNR\n");
+  printf("  -raw_comparison ..... if this flag is not used, RGB is\n");
+  printf("                        premultiplied before comparison\n");
+}
+
+int main(int argc, const char* argv[]) {
+  int return_code = -1;
+  int dump_frames = 0;
+  const char* dump_folder = NULL;
+  double min_psnr = 0.;
+  int got_input1 = 0;
+  int got_input2 = 0;
+  int premultiply = 1;
+  int i, c;
+  const char* files[2] = { NULL, NULL };
+  AnimatedImage images[2];
+
+  if (argc < 3) {
+    Help();
+    return -1;
+  }
+
+  for (c = 1; c < argc; ++c) {
+    int parse_error = 0;
+    if (!strcmp(argv[c], "-dump_frames")) {
+      if (c < argc - 1) {
+        dump_frames = 1;
+        dump_folder = argv[++c];
+      } else {
+        parse_error = 1;
+      }
+    } else if (!strcmp(argv[c], "-min_psnr")) {
+      if (c < argc - 1) {
+        const char* const v = argv[++c];
+        char* end = NULL;
+        const double d = strtod(v, &end);
+        if (end == v) {
+          parse_error = 1;
+          fprintf(stderr, "Error! '%s' is not a floating point number.\n", v);
+        }
+        min_psnr = d;
+      } else {
+        parse_error = 1;
+      }
+    } else if (!strcmp(argv[c], "-raw_comparison")) {
+      premultiply = 0;
+    } else {
+      if (!got_input1) {
+        files[0] = argv[c];
+        got_input1 = 1;
+      } else if (!got_input2) {
+        files[1] = argv[c];
+        got_input2 = 1;
+      } else {
+        parse_error = 1;
+      }
+    }
+    if (parse_error) {
+      Help();
+      return -1;
+    }
+  }
+  if (!got_input2) {
+    Help();
+    return -1;
+  }
+
+  if (dump_frames) {
+    printf("Dumping decoded frames in: %s\n", dump_folder);
+  }
+
+  memset(images, 0, sizeof(images));
+  for (i = 0; i < 2; ++i) {
+    printf("Decoding file: %s\n", files[i]);
+    if (!ReadAnimatedImage(files[i], &images[i], dump_frames, dump_folder)) {
+      fprintf(stderr, "Error decoding file: %s\n Aborting.\n", files[i]);
+      return_code = -2;
+      goto End;
+    } else {
+      MinimizeAnimationFrames(&images[i]);
+    }
+  }
+
+  if (!CompareAnimatedImagePair(&images[0], &images[1],
+                                premultiply, min_psnr)) {
+    fprintf(stderr, "\nFiles %s and %s differ.\n", files[0], files[1]);
+    return_code = -3;
+  } else {
+    printf("\nFiles %s and %s are identical.\n", files[0], files[1]);
+    return_code = 0;
+  }
+ End:
+  ClearAnimatedImage(&images[0]);
+  ClearAnimatedImage(&images[1]);
+  return return_code;
+}
--- a/examples/anim_util.c
+++ b/examples/anim_util.c
@ -0,0 +1,754 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for animated images
+
+#include "./anim_util.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifdef WEBP_HAVE_GIF
+#include <gif_lib.h>
+#endif
+#include "webp/format_constants.h"
+#include "webp/decode.h"
+#include "webp/demux.h"
+#include "./example_util.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+static const int kNumChannels = 4;
+
+// -----------------------------------------------------------------------------
+// Common utilities.
+
+// Returns true if the frame covers the full canvas.
+static int IsFullFrame(int width, int height,
+                       int canvas_width, int canvas_height) {
+  return (width == canvas_width && height == canvas_height);
+}
+
+static int AllocateFrames(AnimatedImage* const image, uint32_t num_frames) {
+  uint32_t i;
+  const size_t rgba_size =
+      image->canvas_width * kNumChannels * image->canvas_height;
+  uint8_t* const mem = (uint8_t*)malloc(num_frames * rgba_size * sizeof(*mem));
+  DecodedFrame* const frames =
+      (DecodedFrame*)malloc(num_frames * sizeof(*frames));
+
+  if (mem == NULL || frames == NULL) {
+    free(mem);
+    free(frames);
+    return 0;
+  }
+  free(image->raw_mem);
+  image->num_frames = num_frames;
+  image->frames = frames;
+  for (i = 0; i < num_frames; ++i) {
+    frames[i].rgba = mem + i * rgba_size;
+    frames[i].duration = 0;
+    frames[i].is_key_frame = 0;
+  }
+  image->raw_mem = mem;
+  return 1;
+}
+
+void ClearAnimatedImage(AnimatedImage* const image) {
+  if (image != NULL) {
+    free(image->raw_mem);
+    free(image->frames);
+    image->num_frames = 0;
+    image->frames = NULL;
+    image->raw_mem = NULL;
+  }
+}
+
+// Clear the canvas to transparent.
+static void ZeroFillCanvas(uint8_t* rgba,
+                           uint32_t canvas_width, uint32_t canvas_height) {
+  memset(rgba, 0, canvas_width * kNumChannels * canvas_height);
+}
+
+// Clear given frame rectangle to transparent.
+static void ZeroFillFrameRect(uint8_t* rgba, int rgba_stride, int x_offset,
+                              int y_offset, int width, int height) {
+  int j;
+  assert(width * kNumChannels <= rgba_stride);
+  rgba += y_offset * rgba_stride + x_offset * kNumChannels;
+  for (j = 0; j < height; ++j) {
+    memset(rgba, 0, width * kNumChannels);
+    rgba += rgba_stride;
+  }
+}
+
+// Copy width * height pixels from 'src' to 'dst'.
+static void CopyCanvas(const uint8_t* src, uint8_t* dst,
+                       uint32_t width, uint32_t height) {
+  assert(src != NULL && dst != NULL);
+  memcpy(dst, src, width * kNumChannels * height);
+}
+
+// Copy pixels in the given rectangle from 'src' to 'dst' honoring the 'stride'.
+static void CopyFrameRectangle(const uint8_t* src, uint8_t* dst, int stride,
+                               int x_offset, int y_offset,
+                               int width, int height) {
+  int j;
+  const int width_in_bytes = width * kNumChannels;
+  const size_t offset = y_offset * stride + x_offset * kNumChannels;
+  assert(width_in_bytes <= stride);
+  src += offset;
+  dst += offset;
+  for (j = 0; j < height; ++j) {
+    memcpy(dst, src, width_in_bytes);
+    src += stride;
+    dst += stride;
+  }
+}
+
+// Canonicalize all transparent pixels to transparent black to aid comparison.
+static void CleanupTransparentPixels(uint32_t* rgba,
+                                     uint32_t width, uint32_t height) {
+  const uint32_t* const rgba_end = rgba + width * height;
+  while (rgba < rgba_end) {
+    const uint8_t alpha = (*rgba >> 24) & 0xff;
+    if (alpha == 0) {
+      *rgba = 0;
+    }
+    ++rgba;
+  }
+}
+
+// Dump frame to a PAM file. Returns true on success.
+static int DumpFrame(const char filename[], const char dump_folder[],
+                     uint32_t frame_num, const uint8_t rgba[],
+                     int canvas_width, int canvas_height) {
+  int ok = 0;
+  size_t max_len;
+  int y;
+  const char* base_name = NULL;
+  char* file_name = NULL;
+  FILE* f = NULL;
+
+  base_name = strrchr(filename, '/');
+  base_name = (base_name == NULL) ? filename : base_name + 1;
+  max_len = strlen(dump_folder) + 1 + strlen(base_name)
+          + strlen("_frame_") + strlen(".pam") + 8;
+  file_name = (char*)malloc(max_len * sizeof(*file_name));
+  if (file_name == NULL) goto End;
+
+  if (snprintf(file_name, max_len, "%s/%s_frame_%d.pam",
+               dump_folder, base_name, frame_num) < 0) {
+    fprintf(stderr, "Error while generating file name\n");
+    goto End;
+  }
+
+  f = fopen(file_name, "wb");
+  if (f == NULL) {
+    fprintf(stderr, "Error opening file for writing: %s\n", file_name);
+    ok = 0;
+    goto End;
+  }
+  if (fprintf(f, "P7\nWIDTH %d\nHEIGHT %d\n"
+              "DEPTH 4\nMAXVAL 255\nTUPLTYPE RGB_ALPHA\nENDHDR\n",
+              canvas_width, canvas_height) < 0) {
+    fprintf(stderr, "Write error for file %s\n", file_name);
+    goto End;
+  }
+  for (y = 0; y < canvas_height; ++y) {
+    if (fwrite((const char*)(rgba) + y * canvas_width * kNumChannels,
+               canvas_width * kNumChannels, 1, f) != 1) {
+      fprintf(stderr, "Error writing to file: %s\n", file_name);
+      goto End;
+    }
+  }
+  ok = 1;
+ End:
+  if (f != NULL) fclose(f);
+  free(file_name);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// WebP Decoding.
+
+// Returns true if this is a valid WebP bitstream.
+static int IsWebP(const WebPData* const webp_data) {
+  return (WebPGetInfo(webp_data->bytes, webp_data->size, NULL, NULL) != 0);
+}
+
+// Read animated WebP bitstream 'file_str' into 'AnimatedImage' struct.
+static int ReadAnimatedWebP(const char filename[],
+                            const WebPData* const webp_data,
+                            AnimatedImage* const image, int dump_frames,
+                            const char dump_folder[]) {
+  int ok = 0;
+  int dump_ok = 1;
+  uint32_t frame_index = 0;
+  int prev_frame_timestamp = 0;
+  WebPAnimDecoder* dec;
+  WebPAnimInfo anim_info;
+
+  memset(image, 0, sizeof(*image));
+
+  dec = WebPAnimDecoderNew(webp_data, NULL);
+  if (dec == NULL) {
+    fprintf(stderr, "Error parsing image: %s\n", filename);
+    goto End;
+  }
+
+  if (!WebPAnimDecoderGetInfo(dec, &anim_info)) {
+    fprintf(stderr, "Error getting global info about the animation\n");
+    goto End;
+  }
+
+  // Animation properties.
+  image->canvas_width = anim_info.canvas_width;
+  image->canvas_height = anim_info.canvas_height;
+  image->loop_count = anim_info.loop_count;
+  image->bgcolor = anim_info.bgcolor;
+
+  // Allocate frames.
+  if (!AllocateFrames(image, anim_info.frame_count)) return 0;
+
+  // Decode frames.
+  while (WebPAnimDecoderHasMoreFrames(dec)) {
+    DecodedFrame* curr_frame;
+    uint8_t* curr_rgba;
+    uint8_t* frame_rgba;
+    int timestamp;
+
+    if (!WebPAnimDecoderGetNext(dec, &frame_rgba, &timestamp)) {
+      fprintf(stderr, "Error decoding frame #%u\n", frame_index);
+      goto End;
+    }
+    curr_frame = &image->frames[frame_index];
+    curr_rgba = curr_frame->rgba;
+    curr_frame->duration = timestamp - prev_frame_timestamp;
+    curr_frame->is_key_frame = 0;  // Unused.
+    memcpy(curr_rgba, frame_rgba,
+           image->canvas_width * kNumChannels * image->canvas_height);
+
+    // Needed only because we may want to compare with GIF later.
+    CleanupTransparentPixels((uint32_t*)curr_rgba,
+                             image->canvas_width, image->canvas_height);
+
+    if (dump_frames && dump_ok) {
+      dump_ok = DumpFrame(filename, dump_folder, frame_index, curr_rgba,
+                          image->canvas_width, image->canvas_height);
+      if (!dump_ok) {  // Print error once, but continue decode loop.
+        fprintf(stderr, "Error dumping frames to %s\n", dump_folder);
+      }
+    }
+
+    ++frame_index;
+    prev_frame_timestamp = timestamp;
+  }
+  ok = dump_ok;
+
+ End:
+  WebPAnimDecoderDelete(dec);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// GIF Decoding.
+
+// Returns true if this is a valid GIF bitstream.
+static int IsGIF(const WebPData* const data) {
+  return data->size > GIF_STAMP_LEN &&
+         (!memcmp(GIF_STAMP, data->bytes, GIF_STAMP_LEN) ||
+          !memcmp(GIF87_STAMP, data->bytes, GIF_STAMP_LEN) ||
+          !memcmp(GIF89_STAMP, data->bytes, GIF_STAMP_LEN));
+}
+
+#ifdef WEBP_HAVE_GIF
+
+// GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
+#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR)
+# define LOCAL_GIF_VERSION ((GIFLIB_MAJOR << 8) | GIFLIB_MINOR)
+# define LOCAL_GIF_PREREQ(maj, min) \
+    (LOCAL_GIF_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GIF_VERSION 0
+# define LOCAL_GIF_PREREQ(maj, min) 0
+#endif
+
+#if !LOCAL_GIF_PREREQ(5, 0)
+
+// Added in v5.0
+typedef struct {
+  int DisposalMode;
+#define DISPOSAL_UNSPECIFIED      0       // No disposal specified
+#define DISPOSE_DO_NOT            1       // Leave image in place
+#define DISPOSE_BACKGROUND        2       // Set area to background color
+#define DISPOSE_PREVIOUS          3       // Restore to previous content
+  int UserInputFlag;       // User confirmation required before disposal
+  int DelayTime;           // Pre-display delay in 0.01sec units
+  int TransparentColor;    // Palette index for transparency, -1 if none
+#define NO_TRANSPARENT_COLOR     -1
+} GraphicsControlBlock;
+
+static int DGifExtensionToGCB(const size_t GifExtensionLength,
+                              const GifByteType* GifExtension,
+                              GraphicsControlBlock* gcb) {
+  if (GifExtensionLength != 4) {
+    return GIF_ERROR;
+  }
+  gcb->DisposalMode = (GifExtension[0] >> 2) & 0x07;
+  gcb->UserInputFlag = (GifExtension[0] & 0x02) != 0;
+  gcb->DelayTime = GifExtension[1] | (GifExtension[2] << 8);
+  if (GifExtension[0] & 0x01) {
+    gcb->TransparentColor = (int)GifExtension[3];
+  } else {
+    gcb->TransparentColor = NO_TRANSPARENT_COLOR;
+  }
+  return GIF_OK;
+}
+
+static int DGifSavedExtensionToGCB(GifFileType* GifFile, int ImageIndex,
+                                   GraphicsControlBlock* gcb) {
+  int i;
+  if (ImageIndex < 0 || ImageIndex > GifFile->ImageCount - 1) {
+    return GIF_ERROR;
+  }
+  gcb->DisposalMode = DISPOSAL_UNSPECIFIED;
+  gcb->UserInputFlag = 0;
+  gcb->DelayTime = 0;
+  gcb->TransparentColor = NO_TRANSPARENT_COLOR;
+
+  for (i = 0; i < GifFile->SavedImages[ImageIndex].ExtensionBlockCount; i++) {
+    ExtensionBlock* ep = &GifFile->SavedImages[ImageIndex].ExtensionBlocks[i];
+    if (ep->Function == GRAPHICS_EXT_FUNC_CODE) {
+      return DGifExtensionToGCB(
+          ep->ByteCount, (const GifByteType*)ep->Bytes, gcb);
+    }
+  }
+  return GIF_ERROR;
+}
+
+#define CONTINUE_EXT_FUNC_CODE 0x00
+
+// Signature was changed in v5.0
+#define DGifOpenFileName(a, b) DGifOpenFileName(a)
+
+#endif  // !LOCAL_GIF_PREREQ(5, 0)
+
+// Signature changed in v5.1
+#if !LOCAL_GIF_PREREQ(5, 1)
+#define DGifCloseFile(a, b) DGifCloseFile(a)
+#endif
+
+static void GIFDisplayError(const GifFileType* const gif, int gif_error) {
+  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
+#if LOCAL_GIF_PREREQ(4, 2)
+#if LOCAL_GIF_PREREQ(5, 0)
+  const char* error_str =
+      GifErrorString((gif == NULL) ? gif_error : gif->Error);
+#else
+  const char* error_str = GifErrorString();
+  (void)gif;
+#endif
+  if (error_str == NULL) error_str = "Unknown error";
+  fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
+#else
+  (void)gif;
+  fprintf(stderr, "GIFLib Error %d: ", gif_error);
+  PrintGifError();
+  fprintf(stderr, "\n");
+#endif
+}
+
+static int IsKeyFrameGIF(const GifImageDesc* prev_desc, int prev_dispose,
+                         const DecodedFrame* const prev_frame,
+                         int canvas_width, int canvas_height) {
+  if (prev_frame == NULL) return 1;
+  if (prev_dispose == DISPOSE_BACKGROUND) {
+    if (IsFullFrame(prev_desc->Width, prev_desc->Height,
+                    canvas_width, canvas_height)) {
+      return 1;
+    }
+    if (prev_frame->is_key_frame) return 1;
+  }
+  return 0;
+}
+
+static int GetTransparentIndexGIF(GifFileType* gif) {
+  GraphicsControlBlock first_gcb;
+  memset(&first_gcb, 0, sizeof(first_gcb));
+  DGifSavedExtensionToGCB(gif, 0, &first_gcb);
+  return first_gcb.TransparentColor;
+}
+
+static uint32_t GetBackgroundColorGIF(GifFileType* gif) {
+  const int transparent_index = GetTransparentIndexGIF(gif);
+  const ColorMapObject* const color_map = gif->SColorMap;
+  if (transparent_index != NO_TRANSPARENT_COLOR &&
+      gif->SBackGroundColor == transparent_index) {
+    return 0x00ffffff;  // Special case: transparent white.
+  } else if (color_map == NULL || color_map->Colors == NULL
+             || gif->SBackGroundColor >= color_map->ColorCount) {
+    return 0xffffffff;  // Invalid: assume white.
+  } else {
+    const GifColorType color = color_map->Colors[gif->SBackGroundColor];
+    return (0xff << 24) |
+           (color.Red << 16) |
+           (color.Green << 8) |
+           (color.Blue << 0);
+  }
+}
+
+// Find appropriate app extension and get loop count from the next extension.
+static uint32_t GetLoopCountGIF(const GifFileType* const gif) {
+  int i;
+  for (i = 0; i < gif->ImageCount; ++i) {
+    const SavedImage* const image = &gif->SavedImages[i];
+    int j;
+    for (j = 0; (j + 1) < image->ExtensionBlockCount; ++j) {
+      const ExtensionBlock* const eb1 = image->ExtensionBlocks + j;
+      const ExtensionBlock* const eb2 = image->ExtensionBlocks + j + 1;
+      const char* const signature = (const char*)eb1->Bytes;
+      const int signature_is_ok =
+          (eb1->Function == APPLICATION_EXT_FUNC_CODE) &&
+          (eb1->ByteCount == 11) &&
+          (!memcmp(signature, "NETSCAPE2.0", 11) ||
+           !memcmp(signature, "ANIMEXTS1.0", 11));
+      if (signature_is_ok &&
+          eb2->Function == CONTINUE_EXT_FUNC_CODE && eb2->ByteCount >= 3 &&
+          eb2->Bytes[0] == 1) {
+        return ((uint32_t)(eb2->Bytes[2]) << 8) +
+               ((uint32_t)(eb2->Bytes[1]) << 0);
+      }
+    }
+  }
+  return 0;  // Default.
+}
+
+// Get duration of 'n'th frame in milliseconds.
+static int GetFrameDurationGIF(GifFileType* gif, int n) {
+  GraphicsControlBlock gcb;
+  memset(&gcb, 0, sizeof(gcb));
+  DGifSavedExtensionToGCB(gif, n, &gcb);
+  return gcb.DelayTime * 10;
+}
+
+// Returns true if frame 'target' completely covers 'covered'.
+static int CoversFrameGIF(const GifImageDesc* const target,
+                          const GifImageDesc* const covered) {
+  return target->Left <= covered->Left &&
+         covered->Left + covered->Width <= target->Left + target->Width &&
+         target->Top <= covered->Top &&
+         covered->Top + covered->Height <= target->Top + target->Height;
+}
+
+static void RemapPixelsGIF(const uint8_t* const src,
+                           const ColorMapObject* const cmap,
+                           int transparent_color, int len, uint8_t* dst) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    if (src[i] != transparent_color) {
+      // If a pixel in the current frame is transparent, we don't modify it, so
+      // that we can see-through the corresponding pixel from an earlier frame.
+      const GifColorType c = cmap->Colors[src[i]];
+      dst[4 * i + 0] = c.Red;
+      dst[4 * i + 1] = c.Green;
+      dst[4 * i + 2] = c.Blue;
+      dst[4 * i + 3] = 0xff;
+    }
+  }
+}
+
+static int ReadFrameGIF(const SavedImage* const gif_image,
+                        const ColorMapObject* cmap, int transparent_color,
+                        int out_stride, uint8_t* const dst) {
+  const GifImageDesc* image_desc = &gif_image->ImageDesc;
+  const uint8_t* in;
+  uint8_t* out;
+  int j;
+
+  if (image_desc->ColorMap) cmap = image_desc->ColorMap;
+
+  if (cmap == NULL || cmap->ColorCount != (1 << cmap->BitsPerPixel)) {
+    fprintf(stderr, "Potentially corrupt color map.\n");
+    return 0;
+  }
+
+  in = (const uint8_t*)gif_image->RasterBits;
+  out = dst + image_desc->Top * out_stride + image_desc->Left * kNumChannels;
+
+  for (j = 0; j < image_desc->Height; ++j) {
+    RemapPixelsGIF(in, cmap, transparent_color, image_desc->Width, out);
+    in += image_desc->Width;
+    out += out_stride;
+  }
+  return 1;
+}
+
+// Read animated GIF bitstream from 'filename' into 'AnimatedImage' struct.
+static int ReadAnimatedGIF(const char filename[], AnimatedImage* const image,
+                           int dump_frames, const char dump_folder[]) {
+  uint32_t frame_count;
+  uint32_t canvas_width, canvas_height;
+  uint32_t i;
+  int gif_error;
+  GifFileType* gif;
+
+  gif = DGifOpenFileName(filename, NULL);
+  if (gif == NULL) {
+    fprintf(stderr, "Could not read file: %s.\n", filename);
+    return 0;
+  }
+
+  gif_error = DGifSlurp(gif);
+  if (gif_error != GIF_OK) {
+    fprintf(stderr, "Could not parse image: %s.\n", filename);
+    GIFDisplayError(gif, gif_error);
+    DGifCloseFile(gif, NULL);
+    return 0;
+  }
+
+  // Animation properties.
+  image->canvas_width = (uint32_t)gif->SWidth;
+  image->canvas_height = (uint32_t)gif->SHeight;
+  if (image->canvas_width > MAX_CANVAS_SIZE ||
+      image->canvas_height > MAX_CANVAS_SIZE) {
+    fprintf(stderr, "Invalid canvas dimension: %d x %d\n",
+            image->canvas_width, image->canvas_height);
+    DGifCloseFile(gif, NULL);
+    return 0;
+  }
+  image->loop_count = GetLoopCountGIF(gif);
+  image->bgcolor = GetBackgroundColorGIF(gif);
+
+  frame_count = (uint32_t)gif->ImageCount;
+  if (frame_count == 0) {
+    DGifCloseFile(gif, NULL);
+    return 0;
+  }
+
+  if (image->canvas_width == 0 || image->canvas_height == 0) {
+    image->canvas_width = gif->SavedImages[0].ImageDesc.Width;
+    image->canvas_height = gif->SavedImages[0].ImageDesc.Height;
+    gif->SavedImages[0].ImageDesc.Left = 0;
+    gif->SavedImages[0].ImageDesc.Top = 0;
+    if (image->canvas_width == 0 || image->canvas_height == 0) {
+      fprintf(stderr, "Invalid canvas size in GIF.\n");
+      DGifCloseFile(gif, NULL);
+      return 0;
+    }
+  }
+  // Allocate frames.
+  AllocateFrames(image, frame_count);
+
+  canvas_width = image->canvas_width;
+  canvas_height = image->canvas_height;
+
+  // Decode and reconstruct frames.
+  for (i = 0; i < frame_count; ++i) {
+    const int canvas_width_in_bytes = canvas_width * kNumChannels;
+    const SavedImage* const curr_gif_image = &gif->SavedImages[i];
+    GraphicsControlBlock curr_gcb;
+    DecodedFrame* curr_frame;
+    uint8_t* curr_rgba;
+
+    memset(&curr_gcb, 0, sizeof(curr_gcb));
+    DGifSavedExtensionToGCB(gif, i, &curr_gcb);
+
+    curr_frame = &image->frames[i];
+    curr_rgba = curr_frame->rgba;
+    curr_frame->duration = GetFrameDurationGIF(gif, i);
+
+    if (i == 0) {  // Initialize as transparent.
+      curr_frame->is_key_frame = 1;
+      ZeroFillCanvas(curr_rgba, canvas_width, canvas_height);
+    } else {
+      DecodedFrame* const prev_frame = &image->frames[i - 1];
+      const GifImageDesc* const prev_desc = &gif->SavedImages[i - 1].ImageDesc;
+      GraphicsControlBlock prev_gcb;
+      memset(&prev_gcb, 0, sizeof(prev_gcb));
+      DGifSavedExtensionToGCB(gif, i - 1, &prev_gcb);
+
+      curr_frame->is_key_frame =
+          IsKeyFrameGIF(prev_desc, prev_gcb.DisposalMode, prev_frame,
+                        canvas_width, canvas_height);
+
+      if (curr_frame->is_key_frame) {  // Initialize as transparent.
+        ZeroFillCanvas(curr_rgba, canvas_width, canvas_height);
+      } else {
+        int prev_frame_disposed, curr_frame_opaque;
+        int prev_frame_completely_covered;
+        // Initialize with previous canvas.
+        uint8_t* const prev_rgba = image->frames[i - 1].rgba;
+        CopyCanvas(prev_rgba, curr_rgba, canvas_width, canvas_height);
+
+        // Dispose previous frame rectangle.
+        prev_frame_disposed =
+            (prev_gcb.DisposalMode == DISPOSE_BACKGROUND ||
+             prev_gcb.DisposalMode == DISPOSE_PREVIOUS);
+        curr_frame_opaque =
+            (curr_gcb.TransparentColor == NO_TRANSPARENT_COLOR);
+        prev_frame_completely_covered =
+            curr_frame_opaque &&
+            CoversFrameGIF(&curr_gif_image->ImageDesc, prev_desc);
+
+        if (prev_frame_disposed && !prev_frame_completely_covered) {
+          switch (prev_gcb.DisposalMode) {
+            case DISPOSE_BACKGROUND: {
+              ZeroFillFrameRect(curr_rgba, canvas_width_in_bytes,
+                                prev_desc->Left, prev_desc->Top,
+                                prev_desc->Width, prev_desc->Height);
+              break;
+            }
+            case DISPOSE_PREVIOUS: {
+              int src_frame_num = i - 2;
+              while (src_frame_num >= 0) {
+                GraphicsControlBlock src_frame_gcb;
+                memset(&src_frame_gcb, 0, sizeof(src_frame_gcb));
+                DGifSavedExtensionToGCB(gif, src_frame_num, &src_frame_gcb);
+                if (src_frame_gcb.DisposalMode != DISPOSE_PREVIOUS) break;
+                --src_frame_num;
+              }
+              if (src_frame_num >= 0) {
+                // Restore pixels inside previous frame rectangle to
+                // corresponding pixels in source canvas.
+                uint8_t* const src_frame_rgba =
+                    image->frames[src_frame_num].rgba;
+                CopyFrameRectangle(src_frame_rgba, curr_rgba,
+                                   canvas_width_in_bytes,
+                                   prev_desc->Left, prev_desc->Top,
+                                   prev_desc->Width, prev_desc->Height);
+              } else {
+                // Source canvas doesn't exist. So clear previous frame
+                // rectangle to background.
+                ZeroFillFrameRect(curr_rgba, canvas_width_in_bytes,
+                                  prev_desc->Left, prev_desc->Top,
+                                  prev_desc->Width, prev_desc->Height);
+              }
+              break;
+            }
+            default:
+              break;  // Nothing to do.
+          }
+        }
+      }
+    }
+
+    // Decode current frame.
+    if (!ReadFrameGIF(curr_gif_image, gif->SColorMap, curr_gcb.TransparentColor,
+                      canvas_width_in_bytes, curr_rgba)) {
+      DGifCloseFile(gif, NULL);
+      return 0;
+    }
+
+    if (dump_frames) {
+      if (!DumpFrame(filename, dump_folder, i, curr_rgba,
+                     canvas_width, canvas_height)) {
+        DGifCloseFile(gif, NULL);
+        return 0;
+      }
+    }
+  }
+  DGifCloseFile(gif, NULL);
+  return 1;
+}
+
+#else
+
+static int ReadAnimatedGIF(const char filename[], AnimatedImage* const image,
+                           int dump_frames, const char dump_folder[]) {
+  (void)filename;
+  (void)image;
+  (void)dump_frames;
+  (void)dump_folder;
+  fprintf(stderr, "GIF support not compiled. Please install the libgif-dev "
+          "package before building.\n");
+  return 0;
+}
+
+#endif  // WEBP_HAVE_GIF
+
+// -----------------------------------------------------------------------------
+
+int ReadAnimatedImage(const char filename[], AnimatedImage* const image,
+                      int dump_frames, const char dump_folder[]) {
+  int ok = 0;
+  WebPData webp_data;
+
+  WebPDataInit(&webp_data);
+  memset(image, 0, sizeof(*image));
+
+  if (!ExUtilReadFile(filename, &webp_data.bytes, &webp_data.size)) {
+    fprintf(stderr, "Error reading file: %s\n", filename);
+    return 0;
+  }
+
+  if (IsWebP(&webp_data)) {
+    ok = ReadAnimatedWebP(filename, &webp_data, image, dump_frames,
+                          dump_folder);
+  } else if (IsGIF(&webp_data)) {
+    ok = ReadAnimatedGIF(filename, image, dump_frames, dump_folder);
+  } else {
+    fprintf(stderr,
+            "Unknown file type: %s. Supported file types are WebP and GIF\n",
+            filename);
+    ok = 0;
+  }
+  if (!ok) ClearAnimatedImage(image);
+  WebPDataClear(&webp_data);
+  return ok;
+}
+
+static void Accumulate(double v1, double v2, double* const max_diff,
+                       double* const sse) {
+  const double diff = fabs(v1 - v2);
+  if (diff > *max_diff) *max_diff = diff;
+  *sse += diff * diff;
+}
+
+void GetDiffAndPSNR(const uint8_t rgba1[], const uint8_t rgba2[],
+                    uint32_t width, uint32_t height, int premultiply,
+                    int* const max_diff, double* const psnr) {
+  const uint32_t stride = width * kNumChannels;
+  const int kAlphaChannel = kNumChannels - 1;
+  double f_max_diff = 0.;
+  double sse = 0.;
+  uint32_t x, y;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < stride; x += kNumChannels) {
+      int k;
+      const size_t offset = y * stride + x;
+      const int alpha1 = rgba1[offset + kAlphaChannel];
+      const int alpha2 = rgba2[offset + kAlphaChannel];
+      Accumulate(alpha1, alpha2, &f_max_diff, &sse);
+      if (!premultiply) {
+        for (k = 0; k < kAlphaChannel; ++k) {
+          Accumulate(rgba1[offset + k], rgba2[offset + k], &f_max_diff, &sse);
+        }
+      } else {
+        // premultiply R/G/B channels with alpha value
+        for (k = 0; k < kAlphaChannel; ++k) {
+          Accumulate(rgba1[offset + k] * alpha1 / 255.,
+                     rgba2[offset + k] * alpha2 / 255.,
+                     &f_max_diff, &sse);
+        }
+      }
+    }
+  }
+  *max_diff = (int)f_max_diff;
+  if (*max_diff == 0) {
+    *psnr = 99.;  // PSNR when images are identical.
+  } else {
+    sse /= stride * height;
+    *psnr = 4.3429448 * log(255. * 255. / sse);
+  }
+}
--- a/examples/anim_util.h
+++ b/examples/anim_util.h
@ -0,0 +1,63 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for animated images
+
+#ifndef WEBP_EXAMPLES_ANIM_UTIL_H_
+#define WEBP_EXAMPLES_ANIM_UTIL_H_
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#include "webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  uint8_t* rgba;         // Decoded and reconstructed full frame.
+  int duration;          // Frame duration in milliseconds.
+  int is_key_frame;      // True if this frame is a key-frame.
+} DecodedFrame;
+
+typedef struct {
+  uint32_t canvas_width;
+  uint32_t canvas_height;
+  uint32_t bgcolor;
+  uint32_t loop_count;
+  DecodedFrame* frames;
+  uint32_t num_frames;
+  void* raw_mem;
+} AnimatedImage;
+
+// Deallocate everything in 'image' (but not the object itself).
+void ClearAnimatedImage(AnimatedImage* const image);
+
+// Read animated image file into 'AnimatedImage' struct.
+// If 'dump_frames' is true, dump frames to 'dump_folder'.
+// Previous content of 'image' is obliterated.
+// Upon successful return, content of 'image' must be deleted by
+// calling 'ClearAnimatedImage'.
+int ReadAnimatedImage(const char filename[], AnimatedImage* const image,
+                      int dump_frames, const char dump_folder[]);
+
+// Given two RGBA buffers, calculate max pixel difference and PSNR.
+// If 'premultiply' is true, R/G/B values will be pre-multiplied by the
+// transparency before comparison.
+void GetDiffAndPSNR(const uint8_t rgba1[], const uint8_t rgba2[],
+                    uint32_t width, uint32_t height, int premultiply,
+                    int* const max_diff, double* const psnr);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_ANIM_UTIL_H_
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@ -48,60 +48,62 @@ extern void* VP8GetCPUInfo;   // opaque forward declaration.

 static int verbose = 0;

-static int ReadYUV(FILE* in_file, WebPPicture* const pic) {
+static int ReadYUV(const uint8_t* const data, size_t data_size,
+                   WebPPicture* const pic) {
+  int y;
  const int use_argb = pic->use_argb;
  const int uv_width = (pic->width + 1) / 2;
  const int uv_height = (pic->height + 1) / 2;
-  int y;
-  int ok = 0;
+  const int uv_plane_size = uv_width * uv_height;
+  const size_t expected_data_size =
+      pic->width * pic->height + 2 * uv_plane_size;
+
+  if (data_size != expected_data_size) {
+    fprintf(stderr,
+            "input data doesn't have the expected size (%d instead of %d)\n",
+            (int)data_size, (int)expected_data_size);
+    return 0;
+  }

  pic->use_argb = 0;
-  if (!WebPPictureAlloc(pic)) return ok;
+  if (!WebPPictureAlloc(pic)) return 0;

  for (y = 0; y < pic->height; ++y) {
-    if (fread(pic->y + y * pic->y_stride, pic->width, 1, in_file) != 1) {
-      goto End;
-    }
+    memcpy(pic->y + y * pic->y_stride, data + y * pic->width,
+           pic->width * sizeof(*pic->y));
  }
  for (y = 0; y < uv_height; ++y) {
-    if (fread(pic->u + y * pic->uv_stride, uv_width, 1, in_file) != 1)
-      goto End;
+    const uint8_t* const uv_data = data + pic->height * pic->y_stride;
+    memcpy(pic->u + y * pic->uv_stride, uv_data + y * uv_width,
+           uv_width * sizeof(*uv_data));
+    memcpy(pic->v + y * pic->uv_stride, uv_data + y * uv_width + uv_plane_size,
+           uv_width * sizeof(*uv_data));
  }
-  for (y = 0; y < uv_height; ++y) {
-    if (fread(pic->v + y * pic->uv_stride, uv_width, 1, in_file) != 1)
-      goto End;
-  }
-  ok = 1;
-  if (use_argb) ok = WebPPictureYUVAToARGB(pic);
-
- End:
-  return ok;
+  return use_argb ? WebPPictureYUVAToARGB(pic) : 1;
 }

 #ifdef HAVE_WINCODEC_H

 static int ReadPicture(const char* const filename, WebPPicture* const pic,
                       int keep_alpha, Metadata* const metadata) {
-  int ok;
+  int ok = 0;
+  const uint8_t* data = NULL;
+  size_t data_size = 0;
  if (pic->width != 0 && pic->height != 0) {
-    // If image size is specified, infer it as YUV format.
-    FILE* in_file = fopen(filename, "rb");
-    if (in_file == NULL) {
-      fprintf(stderr, "Error! Cannot open input file '%s'\n", filename);
-      return 0;
-    }
-    ok = ReadYUV(in_file, pic);
-    fclose(in_file);
+    ok = ExUtilReadFile(filename, &data, &data_size);
+    ok = ok && ReadYUV(data, data_size, pic);
  } else {
    // If no size specified, try to decode it using WIC.
    ok = ReadPictureWithWIC(filename, pic, keep_alpha, metadata);
    if (!ok) {
-      ok = ReadWebP(filename, pic, keep_alpha, metadata);
+      ok = ExUtilReadFile(filename, &data, &data_size);
+      ok = ok && ReadWebP(data, data_size, pic, keep_alpha, metadata);
    }
  }
  if (!ok) {
    fprintf(stderr, "Error! Could not process file %s\n", filename);
  }
+  free((void*)data);
  return ok;
 }

@ -115,18 +117,14 @@ typedef enum {
  UNSUPPORTED
 } InputFileFormat;

-static InputFileFormat GetImageType(FILE* in_file) {
+static uint32_t GetBE32(const uint8_t buf[]) {
+  return ((uint32_t)buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
+}
+
+static InputFileFormat GuessImageType(const uint8_t buf[12]) {
  InputFileFormat format = UNSUPPORTED;
-  uint32_t magic1, magic2;
-  uint8_t buf[12];
-
-  if ((fread(&buf[0], 12, 1, in_file) != 1) ||
-      (fseek(in_file, 0, SEEK_SET) != 0)) {
-    return format;
-  }
-
-  magic1 = ((uint32_t)buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
-  magic2 = ((uint32_t)buf[8] << 24) | (buf[9] << 16) | (buf[10] << 8) | buf[11];
+  const uint32_t magic1 = GetBE32(buf + 0);
+  const uint32_t magic2 = GetBE32(buf + 8);
  if (magic1 == 0x89504E47U) {
    format = PNG_;
  } else if (magic1 >= 0xFFD8FF00U && magic1 <= 0xFFD8FFFFU) {
@ -141,34 +139,36 @@ static InputFileFormat GetImageType(FILE* in_file) {

 static int ReadPicture(const char* const filename, WebPPicture* const pic,
                       int keep_alpha, Metadata* const metadata) {
+  const uint8_t* data = NULL;
+  size_t data_size = 0;
  int ok = 0;
-  FILE* in_file = fopen(filename, "rb");
-  if (in_file == NULL) {
-    fprintf(stderr, "Error! Cannot open input file '%s'\n", filename);
-    return ok;
-  }
+
+  ok = ExUtilReadFile(filename, &data, &data_size);
+  if (!ok) goto End;

  if (pic->width == 0 || pic->height == 0) {
-    // If no size specified, try to decode it as PNG/JPEG (as appropriate).
-    const InputFileFormat format = GetImageType(in_file);
-    if (format == PNG_) {
-      ok = ReadPNG(in_file, pic, keep_alpha, metadata);
-    } else if (format == JPEG_) {
-      ok = ReadJPEG(in_file, pic, metadata);
-    } else if (format == TIFF_) {
-      ok = ReadTIFF(filename, pic, keep_alpha, metadata);
-    } else if (format == WEBP_) {
-      ok = ReadWebP(filename, pic, keep_alpha, metadata);
+    ok = 0;
+    if (data_size >= 12) {
+      const InputFileFormat format = GuessImageType(data);
+      if (format == PNG_) {
+        ok = ReadPNG(data, data_size, pic, keep_alpha, metadata);
+      } else if (format == JPEG_) {
+        ok = ReadJPEG(data, data_size, pic, metadata);
+      } else if (format == TIFF_) {
+        ok = ReadTIFF(data, data_size, pic, keep_alpha, metadata);
+      } else if (format == WEBP_) {
+        ok = ReadWebP(data, data_size, pic, keep_alpha, metadata);
+      }
    }
  } else {
    // If image size is specified, infer it as YUV format.
-    ok = ReadYUV(in_file, pic);
+    ok = ReadYUV(data, data_size, pic);
  }
+ End:
  if (!ok) {
    fprintf(stderr, "Error! Could not process file %s\n", filename);
  }
-
-  fclose(in_file);
+  free((void*)data);
  return ok;
 }

@ -212,6 +212,8 @@ static void PrintFullLosslessInfo(const WebPAuxStats* const stats,
                                  const char* const description) {
  fprintf(stderr, "Lossless-%s compressed size: %d bytes\n",
          description, stats->lossless_size);
+  fprintf(stderr, "  * Header size: %d bytes, image data size: %d\n",
+          stats->lossless_hdr_size, stats->lossless_data_size);
  if (stats->lossless_features) {
    fprintf(stderr, "  * Lossless features used:");
    if (stats->lossless_features & 1) fprintf(stderr, " PREDICTION");
@ -574,10 +576,8 @@ static void HelpLong(void) {
  printf("                            default, photo, picture,\n");
  printf("                            drawing, icon, text\n");
  printf("     -preset must come first, as it overwrites other parameters\n");
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
  printf("  -z <int> ............... activates lossless preset with given\n"
         "                           level in [0:fast, ..., 9:slowest]\n");
-#endif
  printf("\n");
  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
  printf("  -segments <int> ........ number of segments to use (1..4)\n");
@ -608,13 +608,19 @@ static void HelpLong(void) {
  printf("  -alpha_method <int> .... transparency-compression method (0..1)\n");
  printf("  -alpha_filter <string> . predictive filtering for alpha plane,\n");
  printf("                           one of: none, fast (default) or best\n");
-  printf("  -alpha_cleanup ......... clean RGB values in transparent area\n");
+  printf("  -exact ................. preserve RGB values in transparent area"
+         "\n");
  printf("  -blend_alpha <hex> ..... blend colors against background color\n"
         "                           expressed as RGB values written in\n"
         "                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0\n"
         "                           green=0xe0 and blue=0xd0\n");
  printf("  -noalpha ............... discard any transparency information\n");
  printf("  -lossless .............. encode image losslessly\n");
+  printf("  -near_lossless <int> ... use near-lossless image\n"
+         "                           preprocessing (0..100=off)\n");
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  printf("  -delta_palettization ... use delta palettization\n");
+#endif  // WEBP_EXPERIMENTAL_FEATURES
  printf("  -hint <string> ......... specify image characteristics hint,\n");
  printf("                           one of: photo, picture or graph\n");

@ -679,10 +685,8 @@ int main(int argc, const char *argv[]) {
  uint32_t background_color = 0xffffffu;
  int crop = 0, crop_x = 0, crop_y = 0, crop_w = 0, crop_h = 0;
  int resize_w = 0, resize_h = 0;
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
  int lossless_preset = 6;
  int use_lossless_preset = -1;  // -1=unset, 0=don't use, 1=use it
-#endif
  int show_progress = 0;
  int keep_metadata = 0;
  int metadata_written = 0;
@ -736,25 +740,31 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-s") && c < argc - 2) {
      picture.width = ExUtilGetInt(argv[++c], 0, &parse_error);
      picture.height = ExUtilGetInt(argv[++c], 0, &parse_error);
+      if (picture.width > WEBP_MAX_DIMENSION || picture.width < 0 ||
+          picture.height > WEBP_MAX_DIMENSION ||  picture.height < 0) {
+        fprintf(stderr,
+                "Specified dimension (%d x %d) is out of range.\n",
+                picture.width, picture.height);
+        goto Error;
+      }
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
      config.method = ExUtilGetInt(argv[++c], 0, &parse_error);
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
      use_lossless_preset = 0;   // disable -z option
-#endif
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
      use_lossless_preset = 0;   // disable -z option
    } else if (!strcmp(argv[c], "-z") && c < argc - 1) {
      lossless_preset = ExUtilGetInt(argv[++c], 0, &parse_error);
      if (use_lossless_preset != 0) use_lossless_preset = 1;
-#endif
    } else if (!strcmp(argv[c], "-alpha_q") && c < argc - 1) {
      config.alpha_quality = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-alpha_method") && c < argc - 1) {
      config.alpha_compression = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-alpha_cleanup")) {
-      keep_alpha = keep_alpha ? 2 : 0;
+      // This flag is obsolete, does opposite of -exact.
+      config.exact = 0;
+    } else if (!strcmp(argv[c], "-exact")) {
+      config.exact = 1;
    } else if (!strcmp(argv[c], "-blend_alpha") && c < argc - 1) {
      blend_alpha = 1;
      // background color is given in hex with an optional '0x' prefix
@ -776,6 +786,14 @@ int main(int argc, const char *argv[]) {
      keep_alpha = 0;
    } else if (!strcmp(argv[c], "-lossless")) {
      config.lossless = 1;
+    } else if (!strcmp(argv[c], "-near_lossless") && c < argc - 1) {
+      config.near_lossless = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.lossless = 1;  // use near-lossless only with lossless
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    } else if (!strcmp(argv[c], "-delta_palettization")) {
+      config.delta_palettization = 1;
+      config.lossless = 1;  // use delta-palettization only with lossless
+#endif  // WEBP_EXPERIMENTAL_FEATURES
    } else if (!strcmp(argv[c], "-hint") && c < argc - 1) {
      ++c;
      if (!strcmp(argv[c], "photo")) {
@ -836,7 +854,7 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetEncoderVersion();
      printf("%d.%d.%d\n",
-        (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
+             (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
      return 0;
    } else if (!strcmp(argv[c], "-progress")) {
      show_progress = 1;
@ -935,14 +953,12 @@ int main(int argc, const char *argv[]) {
    goto Error;
  }

-#if WEBP_ENCODER_ABI_VERSION > 0x0202
  if (use_lossless_preset == 1) {
    if (!WebPConfigLosslessPreset(&config, lossless_preset)) {
      fprintf(stderr, "Invalid lossless preset (-z %d)\n", lossless_preset);
      goto Error;
    }
  }
-#endif

  // Check for unsupported command line options for lossless mode and log
  // warning for such options.
@ -962,7 +978,11 @@ int main(int argc, const char *argv[]) {
    goto Error;
  }

-  // Read the input
+  // Read the input. We need to decide if we prefer ARGB or YUVA
+  // samples, depending on the expected compression mode (this saves
+  // some conversion steps).
+  picture.use_argb = (config.lossless || config.preprocessing > 0 ||
+                      crop || (resize_w | resize_h) > 0);
  if (verbose) {
    StopwatchReset(&stop_watch);
  }
@ -977,10 +997,6 @@ int main(int argc, const char *argv[]) {
    WebPBlendAlpha(&picture, background_color);
  }

-  if (keep_alpha == 2) {
-    WebPCleanupTransparentArea(&picture);
-  }
-
  if (verbose) {
    const double read_time = StopwatchReadAndReset(&stop_watch);
    fprintf(stderr, "Time to read input: %.3fs\n", read_time);
@ -1017,7 +1033,7 @@ int main(int argc, const char *argv[]) {
    picture.user_data = (void*)in_file;
  }

-  // Compress
+  // Crop & resize.
  if (verbose) {
    StopwatchReset(&stop_watch);
  }
@ -1034,12 +1050,22 @@ int main(int argc, const char *argv[]) {
      goto Error;
    }
  }
+  if (verbose && (crop != 0 || (resize_w | resize_h) > 0)) {
+    const double preproc_time = StopwatchReadAndReset(&stop_watch);
+    fprintf(stderr, "Time to crop/resize picture: %.3fs\n", preproc_time);
+  }
+
  if (picture.extra_info_type > 0) {
    AllocExtraInfo(&picture);
  }
  if (print_distortion >= 0) {  // Save original picture for later comparison
    WebPPictureCopy(&picture, &original_picture);
  }
+
+  // Compress.
+  if (verbose) {
+    StopwatchReset(&stop_watch);
+  }
  if (!WebPEncode(&config, &picture)) {
    fprintf(stderr, "Error! Cannot encode picture as WebP\n");
    fprintf(stderr, "Error code: %d (%s)\n",
@ -1103,16 +1129,19 @@ int main(int argc, const char *argv[]) {
    if (print_distortion >= 0) {    // print distortion
      static const char* distortion_names[] = { "PSNR", "SSIM", "LSIM" };
      float values[5];
-      // Comparison is performed in YUVA colorspace.
-      if (original_picture.use_argb &&
-          !WebPPictureARGBToYUVA(&original_picture, WEBP_YUV420A)) {
-       fprintf(stderr, "Error while converting original picture to YUVA.\n");
-        goto Error;
-      }
-      if (picture.use_argb &&
-          !WebPPictureARGBToYUVA(&picture, WEBP_YUV420A)) {
-        fprintf(stderr, "Error while converting compressed picture to YUVA.\n");
-        goto Error;
+      if (picture.use_argb != original_picture.use_argb) {
+        // Somehow, the WebPEncode() call converted the original picture.
+        // We need to make both match before calling WebPPictureDistortion().
+        int ok = 0;
+        if (picture.use_argb) {
+          ok = WebPPictureYUVAToARGB(&original_picture);
+        } else {
+          ok = WebPPictureARGBToYUVA(&original_picture, WEBP_YUV420A);
+        }
+        if (!ok) {
+          fprintf(stderr, "Error while converting original picture.\n");
+          goto Error;
+        }
      }
      if (!WebPPictureDistortion(&picture, &original_picture,
                                 print_distortion, values)) {
@ -1120,9 +1149,14 @@ int main(int argc, const char *argv[]) {
        goto Error;
      }
      if (!short_output) {
-        fprintf(stderr, "%s: Y:%.2f U:%.2f V:%.2f A:%.2f  Total:%.2f\n",
-                distortion_names[print_distortion],
-                values[0], values[1], values[2], values[3], values[4]);
+        fprintf(stderr, "%s: ", distortion_names[print_distortion]);
+        if (picture.use_argb) {
+          fprintf(stderr, "B:%.2f G:%.2f R:%.2f A:%.2f  Total:%.2f\n",
+                  values[0], values[1], values[2], values[3], values[4]);
+        } else {
+          fprintf(stderr, "Y:%.2f U:%.2f V:%.2f A:%.2f  Total:%.2f\n",
+                  values[0], values[1], values[2], values[3], values[4]);
+        }
      } else {
        fprintf(stderr, "%7d %.4f\n", picture.stats->coded_size, values[4]);
      }
@ -1134,11 +1168,7 @@ int main(int argc, const char *argv[]) {
  return_value = 0;

 Error:
-#if WEBP_ENCODER_ABI_VERSION > 0x0203
  WebPMemoryWriterClear(&memory_writer);
-#else
-  free(memory_writer.mem);
-#endif
  free(picture.extra_info);
  MetadataFree(&metadata);
  WebPPictureFree(&picture);
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -44,6 +44,7 @@
 #include "./stopwatch.h"

 static int verbose = 0;
+static int quiet = 0;
 #ifndef WEBP_DLL
 #ifdef __cplusplus
 extern "C" {
@ -509,10 +510,12 @@ static int SaveOutput(const WebPDecBuffer* const buffer,
    fclose(fout);
  }
  if (ok) {
-    if (use_stdout) {
-      fprintf(stderr, "Saved to stdout\n");
-    } else {
-      fprintf(stderr, "Saved file %s\n", out_file);
+    if (!quiet) {
+      if (use_stdout) {
+        fprintf(stderr, "Saved to stdout\n");
+      } else {
+        fprintf(stderr, "Saved file %s\n", out_file);
+      }
    }
    if (verbose) {
      const double write_time = StopwatchReadAndReset(&stop_watch);
@ -546,19 +549,16 @@ static void Help(void) {
         "  -nofilter .... disable in-loop filtering\n"
         "  -nodither .... disable dithering\n"
         "  -dither <d> .. dithering strength (in 0..100)\n"
-#if WEBP_DECODER_ABI_VERSION > 0x0204
         "  -alpha_dither  use alpha-plane dithering if needed\n"
-#endif
         "  -mt .......... use multi-threading\n"
         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
-         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
-#if WEBP_DECODER_ABI_VERSION > 0x0203
+         "  -resize <w> <h> ......... scale the output (*after* any cropping)\n"
         "  -flip ........ flip the output vertically\n"
-#endif
         "  -alpha ....... only save the alpha plane\n"
         "  -incremental . use incremental decoding (useful for tests)\n"
         "  -h     ....... this help message\n"
         "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
+         "  -quiet ....... quiet mode, don't print anything\n"
 #ifndef WEBP_DLL
         "  -noasm ....... disable all assembly optimizations\n"
 #endif
@ -607,6 +607,8 @@ int main(int argc, const char *argv[]) {
      format = BMP;
    } else if (!strcmp(argv[c], "-tiff")) {
      format = TIFF;
+    } else if (!strcmp(argv[c], "-quiet")) {
+      quiet = 1;
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetDecoderVersion();
      printf("%d.%d.%d\n",
@ -618,10 +620,8 @@ int main(int argc, const char *argv[]) {
      format = YUV;
    } else if (!strcmp(argv[c], "-mt")) {
      config.options.use_threads = 1;
-#if WEBP_DECODER_ABI_VERSION > 0x0204
    } else if (!strcmp(argv[c], "-alpha_dither")) {
      config.options.alpha_dithering_strength = 100;
-#endif
    } else if (!strcmp(argv[c], "-nodither")) {
      config.options.dithering_strength = 0;
    } else if (!strcmp(argv[c], "-dither") && c < argc - 1) {
@ -633,14 +633,13 @@ int main(int argc, const char *argv[]) {
      config.options.crop_top    = ExUtilGetInt(argv[++c], 0, &parse_error);
      config.options.crop_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
      config.options.crop_height = ExUtilGetInt(argv[++c], 0, &parse_error);
-    } else if (!strcmp(argv[c], "-scale") && c < argc - 2) {
+    } else if ((!strcmp(argv[c], "-scale") || !strcmp(argv[c], "-resize")) &&
+               c < argc - 2) {  // '-scale' is left for compatibility
      config.options.use_scaling = 1;
      config.options.scaled_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
      config.options.scaled_height = ExUtilGetInt(argv[++c], 0, &parse_error);
-#if WEBP_DECODER_ABI_VERSION > 0x0203
    } else if (!strcmp(argv[c], "-flip")) {
      config.options.flip = 1;
-#endif
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
 #ifndef WEBP_DLL
@ -672,6 +671,8 @@ int main(int argc, const char *argv[]) {
    return -1;
  }

+  if (quiet) verbose = 0;
+
  {
    VP8StatusCode status = VP8_STATUS_OK;
    size_t data_size = 0;
@ -728,20 +729,24 @@ int main(int argc, const char *argv[]) {
  }

  if (out_file != NULL) {
-    fprintf(stderr, "Decoded %s. Dimensions: %d x %d %s. Format: %s. "
-                    "Now saving...\n",
-            in_file, output_buffer->width, output_buffer->height,
-            bitstream->has_alpha ? " (with alpha)" : "",
-            kFormatType[bitstream->format]);
+    if (!quiet) {
+      fprintf(stderr, "Decoded %s. Dimensions: %d x %d %s. Format: %s. "
+                      "Now saving...\n",
+              in_file, output_buffer->width, output_buffer->height,
+              bitstream->has_alpha ? " (with alpha)" : "",
+              kFormatType[bitstream->format]);
+    }
    ok = SaveOutput(output_buffer, format, out_file);
  } else {
-    fprintf(stderr, "File %s can be decoded "
-                    "(dimensions: %d x %d %s. Format: %s).\n",
-            in_file, output_buffer->width, output_buffer->height,
-            bitstream->has_alpha ? " (with alpha)" : "",
-            kFormatType[bitstream->format]);
-    fprintf(stderr, "Nothing written; "
-                    "use -o flag to save the result as e.g. PNG.\n");
+    if (!quiet) {
+      fprintf(stderr, "File %s can be decoded "
+                      "(dimensions: %d x %d %s. Format: %s).\n",
+              in_file, output_buffer->width, output_buffer->height,
+              bitstream->has_alpha ? " (with alpha)" : "",
+              kFormatType[bitstream->format]);
+      fprintf(stderr, "Nothing written; "
+                      "use -o flag to save the result as e.g. PNG.\n");
+    }
  }
 Exit:
  WebPFreeDecBuffer(output_buffer);
--- a/examples/example_util.c
+++ b/examples/example_util.c
@ -243,7 +243,19 @@ VP8StatusCode ExUtilDecodeWebPIncremental(
      fprintf(stderr, "Failed during WebPINewDecoder().\n");
      return VP8_STATUS_OUT_OF_MEMORY;
    } else {
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      size_t size = 0;
+      const size_t incr = 2 + (data_size / 20);
+      while (size < data_size) {
+        size_t next_size = size + (rand() % incr);
+        if (next_size > data_size) next_size = data_size;
+        status = WebPIUpdate(idec, data, next_size);
+        if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED) break;
+        size = next_size;
+      }
+#else
      status = WebPIUpdate(idec, data, data_size);
+#endif
      WebPIDelete(idec);
    }
  }
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -14,6 +14,7 @@

 #include <assert.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>

 #ifdef HAVE_CONFIG_H
@ -26,182 +27,11 @@
 #include "webp/encode.h"
 #include "webp/mux.h"
 #include "./example_util.h"
-#include "./gif2webp_util.h"
-
-// GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
-#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR)
-# define LOCAL_GIF_VERSION ((GIFLIB_MAJOR << 8) | GIFLIB_MINOR)
-# define LOCAL_GIF_PREREQ(maj, min) \
-    (LOCAL_GIF_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_GIF_VERSION 0
-# define LOCAL_GIF_PREREQ(maj, min) 0
-#endif
-
-#define GIF_TRANSPARENT_MASK 0x01
-#define GIF_DISPOSE_MASK     0x07
-#define GIF_DISPOSE_SHIFT    2
-#define WHITE_COLOR          0xffffffff
-#define MAX_CACHE_SIZE       30
+#include "./gifdec.h"

 //------------------------------------------------------------------------------

-static int transparent_index = -1;  // Opaque frame by default.
-
-static void SanitizeKeyFrameIntervals(size_t* const kmin_ptr,
-                                      size_t* const kmax_ptr) {
-  size_t kmin = *kmin_ptr;
-  size_t kmax = *kmax_ptr;
-  int print_warning = 1;
-
-  if (kmin == 0) {  // Disable keyframe insertion.
-    kmax = ~0;
-    kmin = kmax - 1;
-    print_warning = 0;
-  }
-  if (kmax == 0) {
-    kmax = ~0;
-    print_warning = 0;
-  }
-
-  if (kmin >= kmax) {
-    kmin = kmax - 1;
-    if (print_warning) {
-      fprintf(stderr,
-              "WARNING: Setting kmin = %d, so that kmin < kmax.\n", (int)kmin);
-    }
-  } else if (kmin < (kmax / 2 + 1)) {
-    // This ensures that cache.keyframe + kmin >= kmax is always true. So, we
-    // can flush all the frames in the ‘count_since_key_frame == kmax’ case.
-    kmin = (kmax / 2 + 1);
-    if (print_warning) {
-      fprintf(stderr,
-              "WARNING: Setting kmin = %d, so that kmin >= kmax / 2 + 1.\n",
-              (int)kmin);
-    }
-  }
-  // Limit the max number of frames that are allocated.
-  if (kmax - kmin > MAX_CACHE_SIZE) {
-    kmin = kmax - MAX_CACHE_SIZE;
-    if (print_warning) {
-      fprintf(stderr,
-              "WARNING: Setting kmin = %d, so that kmax - kmin <= 30.\n",
-              (int)kmin);
-    }
-  }
-  *kmin_ptr = kmin;
-  *kmax_ptr = kmax;
-}
-
-static void Remap(const uint8_t* const src, const GifFileType* const gif,
-                  uint32_t* dst, int len) {
-  int i;
-  const GifColorType* colors;
-  const ColorMapObject* const cmap =
-      gif->Image.ColorMap ? gif->Image.ColorMap : gif->SColorMap;
-  if (cmap == NULL) return;
-  colors = cmap->Colors;
-
-  for (i = 0; i < len; ++i) {
-    const GifColorType c = colors[src[i]];
-    dst[i] = (src[i] == transparent_index) ? WEBP_UTIL_TRANSPARENT_COLOR
-           : c.Blue | (c.Green << 8) | (c.Red << 16) | (0xff << 24);
-  }
-}
-
-// Read the GIF image frame.
-static int ReadFrame(GifFileType* const gif, WebPFrameRect* const gif_rect,
-                     WebPPicture* const webp_frame) {
-  WebPPicture sub_image;
-  const GifImageDesc* const image_desc = &gif->Image;
-  uint32_t* dst = NULL;
-  uint8_t* tmp = NULL;
-  int ok = 0;
-  WebPFrameRect rect = {
-      image_desc->Left, image_desc->Top, image_desc->Width, image_desc->Height
-  };
-  *gif_rect = rect;
-
-  // Use a view for the sub-picture:
-  if (!WebPPictureView(webp_frame, rect.x_offset, rect.y_offset,
-                       rect.width, rect.height, &sub_image)) {
-    fprintf(stderr, "Sub-image %dx%d at position %d,%d is invalid!\n",
-            rect.width, rect.height, rect.x_offset, rect.y_offset);
-    return 0;
-  }
-  dst = sub_image.argb;
-
-  tmp = (uint8_t*)malloc(rect.width * sizeof(*tmp));
-  if (tmp == NULL) goto End;
-
-  if (image_desc->Interlace) {  // Interlaced image.
-    // We need 4 passes, with the following offsets and jumps.
-    const int interlace_offsets[] = { 0, 4, 2, 1 };
-    const int interlace_jumps[]   = { 8, 8, 4, 2 };
-    int pass;
-    for (pass = 0; pass < 4; ++pass) {
-      int y;
-      for (y = interlace_offsets[pass]; y < rect.height;
-           y += interlace_jumps[pass]) {
-        if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
-        Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
-      }
-    }
-  } else {  // Non-interlaced image.
-    int y;
-    for (y = 0; y < rect.height; ++y) {
-      if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
-      Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
-    }
-  }
-  ok = 1;
-
- End:
-  if (!ok) webp_frame->error_code = sub_image.error_code;
-  WebPPictureFree(&sub_image);
-  free(tmp);
-  return ok;
-}
-
-static void GetBackgroundColor(const ColorMapObject* const color_map,
-                               int bgcolor_idx, uint32_t* const bgcolor) {
-  if (transparent_index != -1 && bgcolor_idx == transparent_index) {
-    *bgcolor = WEBP_UTIL_TRANSPARENT_COLOR;  // Special case.
-  } else if (color_map == NULL || color_map->Colors == NULL
-             || bgcolor_idx >= color_map->ColorCount) {
-    *bgcolor = WHITE_COLOR;
-    fprintf(stderr,
-            "GIF decode warning: invalid background color index. Assuming "
-            "white background.\n");
-  } else {
-    const GifColorType color = color_map->Colors[bgcolor_idx];
-    *bgcolor = (0xff        << 24)
-             | (color.Red   << 16)
-             | (color.Green <<  8)
-             | (color.Blue  <<  0);
-  }
-}
-
-static void DisplayGifError(const GifFileType* const gif, int gif_error) {
-  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
-#if LOCAL_GIF_PREREQ(4,2)
-#if LOCAL_GIF_PREREQ(5,0)
-  // Static string actually, hence the const char* cast.
-  const char* error_str = (const char*)GifErrorString(
-      (gif == NULL) ? gif_error : gif->Error);
-#else
-  const char* error_str = (const char*)GifErrorString();
-  (void)gif;
-#endif
-  if (error_str == NULL) error_str = "Unknown error";
-  fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
-#else
-  (void)gif;
-  fprintf(stderr, "GIFLib Error %d: ", gif_error);
-  PrintGifError();
-  fprintf(stderr, "\n");
-#endif
-}
+static int transparent_index = GIF_INDEX_INVALID;  // Opaque by default.

 static const char* const kErrorMessages[-WEBP_MUX_NOT_ENOUGH_DATA + 1] = {
  "WEBP_MUX_NOT_FOUND", "WEBP_MUX_INVALID_ARGUMENT", "WEBP_MUX_BAD_DATA",
@ -231,6 +61,10 @@ static void Help(void) {
         "                           or lossless compression heuristically\n");
  printf("  -q <float> ............. quality factor (0:small..100:big)\n");
  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
+  printf("  -min_size .............. minimize output size (default:off)\n"
+         "                           lossless compression by default; can be\n"
+         "                           combined with -q, -m, -lossy or -mixed\n"
+         "                           options\n");
  printf("  -kmin <int> ............ min distance between key frames\n");
  printf("  -kmax <int> ............ max distance between key frames\n");
  printf("  -f <int> ............... filter strength (0=off..100)\n");
@ -257,35 +91,50 @@ int main(int argc, const char *argv[]) {
  const char *in_file = NULL, *out_file = NULL;
  FILE* out = NULL;
  GifFileType* gif = NULL;
+  int frame_duration = 0;
+  int frame_timestamp = 0;
+  GIFDisposeMethod orig_dispose = GIF_DISPOSE_NONE;
+
+  WebPPicture frame;                // Frame rectangle only (not disposed).
+  WebPPicture curr_canvas;          // Not disposed.
+  WebPPicture prev_canvas;          // Disposed.
+  WebPPicture prev_to_prev_canvas;  // Disposed.
+
+  WebPAnimEncoder* enc = NULL;
+  WebPAnimEncoderOptions enc_options;
  WebPConfig config;
-  WebPPicture frame;
-  int duration = 0;
-  FrameDisposeMethod orig_dispose = FRAME_DISPOSE_NONE;
-  WebPMuxAnimParams anim = { WHITE_COLOR, 0 };
-  WebPFrameCache* cache = NULL;

  int is_first_frame = 1;     // Whether we are processing the first frame.
  int done;
  int c;
  int quiet = 0;
-  WebPMux* mux = NULL;
-  WebPData webp_data = { NULL, 0 };
+  WebPData webp_data;
+
  int keep_metadata = METADATA_XMP;  // ICC not output by default.
-  int stored_icc = 0;  // Whether we have already stored an ICC profile.
-  int stored_xmp = 0;
+  WebPData icc_data;
+  int stored_icc = 0;         // Whether we have already stored an ICC profile.
+  WebPData xmp_data;
+  int stored_xmp = 0;         // Whether we have already stored an XMP profile.
+  int loop_count = 0;
+  int stored_loop_count = 0;  // Whether we have found an explicit loop count.
+  WebPMux* mux = NULL;

  int default_kmin = 1;  // Whether to use default kmin value.
  int default_kmax = 1;
-  size_t kmin = 0;
-  size_t kmax = 0;
-  int allow_mixed = 0;   // If true, each frame can be lossy or lossless.

-  if (!WebPConfigInit(&config) || !WebPPictureInit(&frame)) {
+  if (!WebPConfigInit(&config) || !WebPAnimEncoderOptionsInit(&enc_options) ||
+      !WebPPictureInit(&frame) || !WebPPictureInit(&curr_canvas) ||
+      !WebPPictureInit(&prev_canvas) ||
+      !WebPPictureInit(&prev_to_prev_canvas)) {
    fprintf(stderr, "Error! Version mismatch!\n");
    return -1;
  }
  config.lossless = 1;  // Use lossless compression by default.

+  WebPDataInit(&webp_data);
+  WebPDataInit(&icc_data);
+  WebPDataInit(&xmp_data);
+
  if (argc == 1) {
    Help();
    return 0;
@ -301,17 +150,19 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-lossy")) {
      config.lossless = 0;
    } else if (!strcmp(argv[c], "-mixed")) {
-      allow_mixed = 1;
+      enc_options.allow_mixed = 1;
      config.lossless = 0;
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
      config.method = ExUtilGetInt(argv[++c], 0, &parse_error);
+    } else if (!strcmp(argv[c], "-min_size")) {
+      enc_options.minimize_size = 1;
    } else if (!strcmp(argv[c], "-kmax") && c < argc - 1) {
-      kmax = ExUtilGetUInt(argv[++c], 0, &parse_error);
+      enc_options.kmax = ExUtilGetInt(argv[++c], 0, &parse_error);
      default_kmax = 0;
    } else if (!strcmp(argv[c], "-kmin") && c < argc - 1) {
-      kmin = ExUtilGetUInt(argv[++c], 0, &parse_error);
+      enc_options.kmin = ExUtilGetInt(argv[++c], 0, &parse_error);
      default_kmin = 0;
    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
      config.filter_strength = ExUtilGetInt(argv[++c], 0, &parse_error);
@ -368,6 +219,7 @@ int main(int argc, const char *argv[]) {
      quiet = 1;
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
+      enc_options.verbose = 1;
    } else if (!strcmp(argv[c], "--")) {
      if (c < argc - 1) in_file = argv[++c];
      break;
@ -387,12 +239,11 @@ int main(int argc, const char *argv[]) {

  // Appropriate default kmin, kmax values for lossy and lossless.
  if (default_kmin) {
-    kmin = config.lossless ? 9 : 3;
+    enc_options.kmin = config.lossless ? 9 : 3;
  }
  if (default_kmax) {
-    kmax = config.lossless ? 17 : 5;
+    enc_options.kmax = config.lossless ? 17 : 5;
  }
-  SanitizeKeyFrameIntervals(&kmin, &kmax);

  if (!WebPValidateConfig(&config)) {
    fprintf(stderr, "Error! Invalid configuration.\n");
@ -413,12 +264,6 @@ int main(int argc, const char *argv[]) {
 #endif
  if (gif == NULL) goto End;

-  mux = WebPMuxNew();
-  if (mux == NULL) {
-    fprintf(stderr, "ERROR: could not create a mux object.\n");
-    goto End;
-  }
-
  // Loop over GIF images
  done = 0;
  do {
@ -427,17 +272,17 @@ int main(int argc, const char *argv[]) {

    switch (type) {
      case IMAGE_DESC_RECORD_TYPE: {
-        WebPFrameRect gif_rect;
+        GIFFrameRect gif_rect;
        GifImageDesc* const image_desc = &gif->Image;

        if (!DGifGetImageDesc(gif)) goto End;

-        // Fix some broken GIF global headers that report
-        // 0 x 0 screen dimension.
        if (is_first_frame) {
          if (verbose) {
            printf("Canvas screen: %d x %d\n", gif->SWidth, gif->SHeight);
          }
+          // Fix some broken GIF global headers that report
+          // 0 x 0 screen dimension.
          if (gif->SWidth == 0 || gif->SHeight == 0) {
            image_desc->Left = 0;
            image_desc->Top = 0;
@ -451,61 +296,64 @@ int main(int argc, const char *argv[]) {
                     gif->SWidth, gif->SHeight);
            }
          }
-#if WEBP_MUX_ABI_VERSION > 0x0101
-          // Set definitive canvas size.
-          err = WebPMuxSetCanvasSize(mux, gif->SWidth, gif->SHeight);
-          if (err != WEBP_MUX_OK) {
-            fprintf(stderr, "Invalid canvas size %d x %d\n",
-                    gif->SWidth, gif->SHeight);
-            goto End;
-          }
-#endif
          // Allocate current buffer.
          frame.width = gif->SWidth;
          frame.height = gif->SHeight;
          frame.use_argb = 1;
          if (!WebPPictureAlloc(&frame)) goto End;
-          WebPUtilClearPic(&frame, NULL);
-
-          // Initialize cache.
-          cache = WebPFrameCacheNew(frame.width, frame.height,
-                                    kmin, kmax, allow_mixed);
-          if (cache == NULL) goto End;
+          GIFClearPic(&frame, NULL);
+          WebPPictureCopy(&frame, &curr_canvas);
+          WebPPictureCopy(&frame, &prev_canvas);
+          WebPPictureCopy(&frame, &prev_to_prev_canvas);

          // Background color.
-          GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
-                             &anim.bgcolor);
+          GIFGetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
+                                transparent_index,
+                                &enc_options.anim_params.bgcolor);
+
+          // Initialize encoder.
+          enc = WebPAnimEncoderNew(curr_canvas.width, curr_canvas.height,
+                                   &enc_options);
+          if (enc == NULL) {
+            fprintf(stderr,
+                    "Error! Could not create encoder object. Possibly due to "
+                    "a memory error.\n");
+            goto End;
+          }
+          is_first_frame = 0;
        }
+
        // Some even more broken GIF can have sub-rect with zero width/height.
        if (image_desc->Width == 0 || image_desc->Height == 0) {
          image_desc->Width = gif->SWidth;
          image_desc->Height = gif->SHeight;
        }

-        if (!ReadFrame(gif, &gif_rect, &frame)) {
+        if (!GIFReadFrame(gif, transparent_index, &gif_rect, &frame)) {
          goto End;
        }
+        // Blend frame rectangle with previous canvas to compose full canvas.
+        // Note that 'curr_canvas' is same as 'prev_canvas' at this point.
+        GIFBlendFrames(&frame, &gif_rect, &curr_canvas);

-        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, orig_dispose,
-                                    duration, &frame)) {
-          fprintf(stderr, "Error! Cannot encode frame as WebP\n");
-          fprintf(stderr, "Error code: %d\n", frame.error_code);
+        if (!WebPAnimEncoderAdd(enc, &curr_canvas, frame_timestamp, &config)) {
+          fprintf(stderr, "%s\n", WebPAnimEncoderGetError(enc));
        }

-        err = WebPFrameCacheFlush(cache, verbose, mux);
-        if (err != WEBP_MUX_OK) {
-          fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
-                  ErrorString(err));
-          goto End;
-        }
-        is_first_frame = 0;
+        // Update canvases.
+        GIFCopyPixels(&prev_canvas, &prev_to_prev_canvas);
+        GIFDisposeFrame(orig_dispose, &gif_rect, &prev_canvas, &curr_canvas);
+        GIFCopyPixels(&curr_canvas, &prev_canvas);
+
+        // Update timestamp (for next frame).
+        frame_timestamp += frame_duration;

        // In GIF, graphic control extensions are optional for a frame, so we
        // may not get one before reading the next frame. To handle this case,
        // we reset frame properties to reasonable defaults for the next frame.
-        orig_dispose = FRAME_DISPOSE_NONE;
-        duration = 0;
-        transparent_index = -1;  // Opaque frame by default.
+        orig_dispose = GIF_DISPOSE_NONE;
+        frame_duration = 0;
+        transparent_index = GIF_INDEX_INVALID;
        break;
      }
      case EXTENSION_RECORD_TYPE: {
@ -519,25 +367,10 @@ int main(int argc, const char *argv[]) {
            break;  // Do nothing for now.
          }
          case GRAPHICS_EXT_FUNC_CODE: {
-            const int flags = data[1];
-            const int dispose = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
-            const int delay = data[2] | (data[3] << 8);  // In 10 ms units.
-            if (data[0] != 4) goto End;
-            duration = delay * 10;  // Duration is in 1 ms units for WebP.
-            switch (dispose) {
-              case 3:
-                orig_dispose = FRAME_DISPOSE_RESTORE_PREVIOUS;
-                break;
-              case 2:
-                orig_dispose = FRAME_DISPOSE_BACKGROUND;
-                break;
-              case 1:
-              case 0:
-              default:
-                orig_dispose = FRAME_DISPOSE_NONE;
-                break;
+            if (!GIFReadGraphicsExtension(data, &frame_duration, &orig_dispose,
+                                          &transparent_index)) {
+              goto End;
            }
-            transparent_index = (flags & GIF_TRANSPARENT_MASK) ? data[4] : -1;
            break;
          }
          case PLAINTEXT_EXT_FUNC_CODE: {
@ -547,14 +380,13 @@ int main(int argc, const char *argv[]) {
            if (data[0] != 11) break;    // Chunk is too short
            if (!memcmp(data + 1, "NETSCAPE2.0", 11) ||
                !memcmp(data + 1, "ANIMEXTS1.0", 11)) {
-              // Recognize and parse Netscape2.0 NAB extension for loop count.
-              if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
-              if (data == NULL) goto End;  // Loop count sub-block missing.
-              if (data[0] < 3 || data[1] != 1) break;   // wrong size/marker
-              anim.loop_count = data[2] | (data[3] << 8);
-              if (verbose) {
-                fprintf(stderr, "Loop count: %d\n", anim.loop_count);
+              if (!GIFReadLoopCount(gif, &data, &loop_count)) {
+                goto End;
              }
+              if (verbose) {
+                fprintf(stderr, "Loop count: %d\n", loop_count);
+              }
+              stored_loop_count = (loop_count != 0);
            } else {  // An extension containing metadata.
              // We only store the first encountered chunk of each type, and
              // only if requested by the user.
@ -565,56 +397,8 @@ int main(int argc, const char *argv[]) {
                                 !stored_icc &&
                                 !memcmp(data + 1, "ICCRGBG1012", 11);
              if (is_xmp || is_icc) {
-                const char* const fourccs[2] = { "XMP " , "ICCP" };
-                const char* const features[2] = { "XMP" , "ICC" };
-                WebPData metadata = { NULL, 0 };
-                // Construct metadata from sub-blocks.
-                // Usual case (including ICC profile): In each sub-block, the
-                // first byte specifies its size in bytes (0 to 255) and the
-                // rest of the bytes contain the data.
-                // Special case for XMP data: In each sub-block, the first byte
-                // is also part of the XMP payload. XMP in GIF also has a 257
-                // byte padding data. See the XMP specification for details.
-                while (1) {
-                  WebPData prev_metadata = metadata;
-                  WebPData subblock;
-                  if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) {
-                    WebPDataClear(&metadata);
-                    goto End;
-                  }
-                  if (data == NULL) break;  // Finished.
-                  subblock.size = is_xmp ? data[0] + 1 : data[0];
-                  assert(subblock.size > 0);
-                  subblock.bytes = is_xmp ? data : data + 1;
-                  metadata.bytes =
-                      (uint8_t*)realloc((void*)metadata.bytes,
-                                        prev_metadata.size + subblock.size);
-                  if (metadata.bytes == NULL) {
-                    WebPDataClear(&prev_metadata);
-                    goto End;
-                  }
-                  metadata.size += subblock.size;
-                  memcpy((void*)(metadata.bytes + prev_metadata.size),
-                         subblock.bytes, subblock.size);
-                }
-                if (is_xmp) {
-                  // XMP padding data is 0x01, 0xff, 0xfe ... 0x01, 0x00.
-                  const size_t xmp_pading_size = 257;
-                  if (metadata.size > xmp_pading_size) {
-                    metadata.size -= xmp_pading_size;
-                  }
-                }
-
-                // Add metadata chunk.
-                err = WebPMuxSetChunk(mux, fourccs[is_icc], &metadata, 1);
-                if (verbose) {
-                  fprintf(stderr, "%s size: %d\n",
-                          features[is_icc], (int)metadata.size);
-                }
-                WebPDataClear(&metadata);
-                if (err != WEBP_MUX_OK) {
-                  fprintf(stderr, "ERROR (%s): Could not set %s chunk.\n",
-                          ErrorString(err), features[is_icc]);
+                if (!GIFReadMetadata(gif, &data,
+                                     is_xmp ? &xmp_data : &icc_data)) {
                  goto End;
                }
                if (is_icc) {
@ -648,38 +432,88 @@ int main(int argc, const char *argv[]) {
    }
  } while (!done);

-  // Flush any pending frames.
-  err = WebPFrameCacheFlushAll(cache, verbose, mux);
-  if (err != WEBP_MUX_OK) {
-    fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
-            ErrorString(err));
+  // Last NULL frame.
+  if (!WebPAnimEncoderAdd(enc, NULL, frame_timestamp, NULL)) {
+    fprintf(stderr, "Error flushing WebP muxer.\n");
+    fprintf(stderr, "%s\n", WebPAnimEncoderGetError(enc));
+  }
+
+  if (!WebPAnimEncoderAssemble(enc, &webp_data)) {
+    fprintf(stderr, "%s\n", WebPAnimEncoderGetError(enc));
    goto End;
  }

-  // Finish muxing
-  err = WebPMuxSetAnimationParams(mux, &anim);
-  if (err != WEBP_MUX_OK) {
-    fprintf(stderr, "ERROR (%s): Could not set animation parameters.\n",
-            ErrorString(err));
-    goto End;
+  if (stored_loop_count || stored_icc || stored_xmp) {
+    // Re-mux to add loop count and/or metadata as needed.
+    mux = WebPMuxCreate(&webp_data, 1);
+    if (mux == NULL) {
+      fprintf(stderr, "ERROR: Could not re-mux to add loop count/metadata.\n");
+      goto End;
+    }
+    WebPDataClear(&webp_data);
+
+    if (stored_loop_count) {  // Update loop count.
+      WebPMuxAnimParams new_params;
+      err = WebPMuxGetAnimationParams(mux, &new_params);
+      if (err != WEBP_MUX_OK) {
+        fprintf(stderr, "ERROR (%s): Could not fetch loop count.\n",
+                ErrorString(err));
+        goto End;
+      }
+      new_params.loop_count = loop_count;
+      err = WebPMuxSetAnimationParams(mux, &new_params);
+      if (err != WEBP_MUX_OK) {
+        fprintf(stderr, "ERROR (%s): Could not update loop count.\n",
+                ErrorString(err));
+        goto End;
+      }
+    }
+
+    if (stored_icc) {   // Add ICCP chunk.
+      err = WebPMuxSetChunk(mux, "ICCP", &icc_data, 1);
+      if (verbose) {
+        fprintf(stderr, "ICC size: %d\n", (int)icc_data.size);
+      }
+      if (err != WEBP_MUX_OK) {
+        fprintf(stderr, "ERROR (%s): Could not set ICC chunk.\n",
+                ErrorString(err));
+        goto End;
+      }
+    }
+
+    if (stored_xmp) {   // Add XMP chunk.
+      err = WebPMuxSetChunk(mux, "XMP ", &xmp_data, 1);
+      if (verbose) {
+        fprintf(stderr, "XMP size: %d\n", (int)xmp_data.size);
+      }
+      if (err != WEBP_MUX_OK) {
+        fprintf(stderr, "ERROR (%s): Could not set XMP chunk.\n",
+                ErrorString(err));
+        goto End;
+      }
+    }
+
+    err = WebPMuxAssemble(mux, &webp_data);
+    if (err != WEBP_MUX_OK) {
+      fprintf(stderr, "ERROR (%s): Could not assemble when re-muxing to add "
+              "loop count/metadata.\n", ErrorString(err));
+      goto End;
+    }
  }

-  err = WebPMuxAssemble(mux, &webp_data);
-  if (err != WEBP_MUX_OK) {
-    fprintf(stderr, "ERROR (%s) assembling the WebP file.\n", ErrorString(err));
-    goto End;
-  }
  if (out_file != NULL) {
    if (!ExUtilWriteFile(out_file, webp_data.bytes, webp_data.size)) {
      fprintf(stderr, "Error writing output file: %s\n", out_file);
      goto End;
    }
    if (!quiet) {
-      fprintf(stderr, "Saved output file: %s\n", out_file);
+      fprintf(stderr, "Saved output file (%d bytes): %s\n",
+              (int)webp_data.size, out_file);
    }
  } else {
    if (!quiet) {
-      fprintf(stderr, "Nothing written; use -o flag to save the result.\n");
+      fprintf(stderr, "Nothing written; use -o flag to save the result "
+                      "(%d bytes).\n", (int)webp_data.size);
    }
  }

@ -688,14 +522,19 @@ int main(int argc, const char *argv[]) {
  gif_error = GIF_OK;

 End:
-  WebPDataClear(&webp_data);
+  WebPDataClear(&icc_data);
+  WebPDataClear(&xmp_data);
  WebPMuxDelete(mux);
+  WebPDataClear(&webp_data);
  WebPPictureFree(&frame);
-  WebPFrameCacheDelete(cache);
+  WebPPictureFree(&curr_canvas);
+  WebPPictureFree(&prev_canvas);
+  WebPPictureFree(&prev_to_prev_canvas);
+  WebPAnimEncoderDelete(enc);
  if (out != NULL && out_file != NULL) fclose(out);

  if (gif_error != GIF_OK) {
-    DisplayGifError(gif, gif_error);
+    GIFDisplayError(gif, gif_error);
  }
  if (gif != NULL) {
 #if LOCAL_GIF_PREREQ(5,1)
--- a/examples/gif2webp_util.c
+++ b/examples/gif2webp_util.c
--- a/examples/gif2webp_util.h
+++ b/examples/gif2webp_util.h
@ -1,88 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//  Helper structs and methods for gif2webp tool.
-//
-// Author: Urvang (urvang@google.com)
-
-#ifndef WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
-#define WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
-
-#include <stdlib.h>
-
-#include "webp/mux.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//------------------------------------------------------------------------------
-// Helper utilities.
-
-#define WEBP_UTIL_TRANSPARENT_COLOR 0x00ffffff
-
-struct WebPPicture;
-
-// Includes all disposal methods, even the ones not supported by WebP bitstream.
-typedef enum FrameDisposeMethod {
-  FRAME_DISPOSE_NONE,
-  FRAME_DISPOSE_BACKGROUND,
-  FRAME_DISPOSE_RESTORE_PREVIOUS
-} FrameDisposeMethod;
-
-typedef struct {
-  int x_offset, y_offset, width, height;
-} WebPFrameRect;
-
-// Clear pixels in 'picture' within given 'rect' to transparent color.
-void WebPUtilClearPic(struct WebPPicture* const picture,
-                      const WebPFrameRect* const rect);
-
-//------------------------------------------------------------------------------
-// Frame cache.
-
-typedef struct WebPFrameCache WebPFrameCache;
-
-// Given the minimum distance between key frames 'kmin' and maximum distance
-// between key frames 'kmax', returns an appropriately allocated cache object.
-// If 'allow_mixed' is true, the subsequent calls to WebPFrameCacheAddFrame()
-// will heuristically pick lossy or lossless compression for each frame.
-// Use WebPFrameCacheDelete() to deallocate the 'cache'.
-WebPFrameCache* WebPFrameCacheNew(int width, int height,
-                                  size_t kmin, size_t kmax, int allow_mixed);
-
-// Release all the frame data from 'cache' and free 'cache'.
-void WebPFrameCacheDelete(WebPFrameCache* const cache);
-
-// Given an image described by 'frame', 'rect', 'dispose_method' and 'duration',
-// optimize it for WebP, encode it and add it to 'cache'. 'rect' can be NULL.
-// This takes care of frame disposal too, according to 'dispose_method'.
-// Returns false in case of error (and sets frame->error_code accordingly).
-int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
-                           const WebPConfig* const config,
-                           const WebPFrameRect* const rect,
-                           FrameDisposeMethod dispose_method, int duration,
-                           WebPPicture* const frame);
-
-// Flush the *ready* frames from cache and add them to 'mux'. If 'verbose' is
-// true, prints the information about these frames.
-WebPMuxError WebPFrameCacheFlush(WebPFrameCache* const cache, int verbose,
-                                 WebPMux* const mux);
-
-// Similar to 'WebPFrameCacheFlushFrames()', but flushes *all* the frames.
-WebPMuxError WebPFrameCacheFlushAll(WebPFrameCache* const cache, int verbose,
-                                    WebPMux* const mux);
-
-//------------------------------------------------------------------------------
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  // WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
--- a/examples/gifdec.c
+++ b/examples/gifdec.c
@ -0,0 +1,396 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// GIF decode.
+
+#include "./gifdec.h"
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_GIF
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/encode.h"
+#include "webp/mux_types.h"
+
+#define GIF_TRANSPARENT_COLOR 0x00ffffff
+#define GIF_WHITE_COLOR       0xffffffff
+#define GIF_TRANSPARENT_MASK  0x01
+#define GIF_DISPOSE_MASK      0x07
+#define GIF_DISPOSE_SHIFT     2
+
+// from utils/utils.h
+extern void WebPCopyPlane(const uint8_t* src, int src_stride,
+                          uint8_t* dst, int dst_stride,
+                          int width, int height);
+extern void WebPCopyPixels(const WebPPicture* const src,
+                           WebPPicture* const dst);
+
+void GIFGetBackgroundColor(const ColorMapObject* const color_map,
+                           int bgcolor_index, int transparent_index,
+                           uint32_t* const bgcolor) {
+  if (transparent_index != GIF_INDEX_INVALID &&
+      bgcolor_index == transparent_index) {
+    *bgcolor = GIF_TRANSPARENT_COLOR;  // Special case.
+  } else if (color_map == NULL || color_map->Colors == NULL
+             || bgcolor_index >= color_map->ColorCount) {
+    *bgcolor = GIF_WHITE_COLOR;
+    fprintf(stderr,
+            "GIF decode warning: invalid background color index. Assuming "
+            "white background.\n");
+  } else {
+    const GifColorType color = color_map->Colors[bgcolor_index];
+    *bgcolor = (0xff        << 24)
+             | (color.Red   << 16)
+             | (color.Green <<  8)
+             | (color.Blue  <<  0);
+  }
+}
+
+int GIFReadGraphicsExtension(const GifByteType* const buf, int* const duration,
+                             GIFDisposeMethod* const dispose,
+                             int* const transparent_index) {
+  const int flags = buf[1];
+  const int dispose_raw = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
+  const int duration_raw = buf[2] | (buf[3] << 8);  // In 10 ms units.
+  if (buf[0] != 4) return 0;
+  *duration = duration_raw * 10;  // Duration is in 1 ms units.
+  switch (dispose_raw) {
+    case 3:
+      *dispose = GIF_DISPOSE_RESTORE_PREVIOUS;
+      break;
+    case 2:
+      *dispose = GIF_DISPOSE_BACKGROUND;
+      break;
+    case 1:
+    case 0:
+    default:
+      *dispose = GIF_DISPOSE_NONE;
+      break;
+  }
+  *transparent_index =
+      (flags & GIF_TRANSPARENT_MASK) ? buf[4] : GIF_INDEX_INVALID;
+  return 1;
+}
+
+static void Remap(const GifFileType* const gif, const uint8_t* const src,
+                  int len, int transparent_index, uint32_t* dst) {
+  int i;
+  const GifColorType* colors;
+  const ColorMapObject* const cmap =
+      gif->Image.ColorMap ? gif->Image.ColorMap : gif->SColorMap;
+  if (cmap == NULL) return;
+  colors = cmap->Colors;
+
+  for (i = 0; i < len; ++i) {
+    const GifColorType c = colors[src[i]];
+    dst[i] = (src[i] == transparent_index) ? GIF_TRANSPARENT_COLOR
+           : c.Blue | (c.Green << 8) | (c.Red << 16) | (0xff << 24);
+  }
+}
+
+int GIFReadFrame(GifFileType* const gif, int transparent_index,
+                 GIFFrameRect* const gif_rect, WebPPicture* const picture) {
+  WebPPicture sub_image;
+  const GifImageDesc* const image_desc = &gif->Image;
+  uint32_t* dst = NULL;
+  uint8_t* tmp = NULL;
+  int ok = 0;
+  GIFFrameRect rect = {
+      image_desc->Left, image_desc->Top, image_desc->Width, image_desc->Height
+  };
+  *gif_rect = rect;
+
+  // Use a view for the sub-picture:
+  if (!WebPPictureView(picture, rect.x_offset, rect.y_offset,
+                       rect.width, rect.height, &sub_image)) {
+    fprintf(stderr, "Sub-image %dx%d at position %d,%d is invalid!\n",
+            rect.width, rect.height, rect.x_offset, rect.y_offset);
+    return 0;
+  }
+  dst = sub_image.argb;
+
+  tmp = (uint8_t*)malloc(rect.width * sizeof(*tmp));
+  if (tmp == NULL) goto End;
+
+  if (image_desc->Interlace) {  // Interlaced image.
+    // We need 4 passes, with the following offsets and jumps.
+    const int interlace_offsets[] = { 0, 4, 2, 1 };
+    const int interlace_jumps[]   = { 8, 8, 4, 2 };
+    int pass;
+    for (pass = 0; pass < 4; ++pass) {
+      int y;
+      for (y = interlace_offsets[pass]; y < rect.height;
+           y += interlace_jumps[pass]) {
+        if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
+        Remap(gif, tmp, rect.width, transparent_index,
+              dst + y * sub_image.argb_stride);
+      }
+    }
+  } else {  // Non-interlaced image.
+    int y;
+    for (y = 0; y < rect.height; ++y) {
+      if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
+      Remap(gif, tmp, rect.width, transparent_index,
+            dst + y * sub_image.argb_stride);
+    }
+  }
+  ok = 1;
+
+ End:
+  if (!ok) picture->error_code = sub_image.error_code;
+  WebPPictureFree(&sub_image);
+  free(tmp);
+  return ok;
+}
+
+int GIFReadLoopCount(GifFileType* const gif, GifByteType** const buf,
+                     int* const loop_count) {
+  assert(!memcmp(*buf + 1, "NETSCAPE2.0", 11) ||
+         !memcmp(*buf + 1, "ANIMEXTS1.0", 11));
+  if (DGifGetExtensionNext(gif, buf) == GIF_ERROR) {
+    return 0;
+  }
+  if (*buf == NULL) {
+    return 0;  // Loop count sub-block missing.
+  }
+  if ((*buf)[0] < 3 || (*buf)[1] != 1) {
+    return 0;   // wrong size/marker
+  }
+  *loop_count = (*buf)[2] | ((*buf)[3] << 8);
+  return 1;
+}
+
+int GIFReadMetadata(GifFileType* const gif, GifByteType** const buf,
+                    WebPData* const metadata) {
+  const int is_xmp = !memcmp(*buf + 1, "XMP DataXMP", 11);
+  const int is_icc = !memcmp(*buf + 1, "ICCRGBG1012", 11);
+  assert(is_xmp || is_icc);
+  (void)is_icc;  // silence unused warning.
+  // Construct metadata from sub-blocks.
+  // Usual case (including ICC profile): In each sub-block, the
+  // first byte specifies its size in bytes (0 to 255) and the
+  // rest of the bytes contain the data.
+  // Special case for XMP data: In each sub-block, the first byte
+  // is also part of the XMP payload. XMP in GIF also has a 257
+  // byte padding data. See the XMP specification for details.
+  while (1) {
+    WebPData subblock;
+    const uint8_t* tmp;
+    if (DGifGetExtensionNext(gif, buf) == GIF_ERROR) {
+      return 0;
+    }
+    if (*buf == NULL) break;  // Finished.
+    subblock.size = is_xmp ? (*buf)[0] + 1 : (*buf)[0];
+    assert(subblock.size > 0);
+    subblock.bytes = is_xmp ? *buf : *buf + 1;
+    // Note: We store returned value in 'tmp' first, to avoid
+    // leaking old memory in metadata->bytes on error.
+    tmp = (uint8_t*)realloc((void*)metadata->bytes,
+                            metadata->size + subblock.size);
+    if (tmp == NULL) {
+      return 0;
+    }
+    memcpy((void*)(tmp + metadata->size),
+           subblock.bytes, subblock.size);
+    metadata->bytes = tmp;
+    metadata->size += subblock.size;
+  }
+  if (is_xmp) {
+    // XMP padding data is 0x01, 0xff, 0xfe ... 0x01, 0x00.
+    const size_t xmp_pading_size = 257;
+    if (metadata->size > xmp_pading_size) {
+      metadata->size -= xmp_pading_size;
+    }
+  }
+  return 1;
+}
+
+static void ClearRectangle(WebPPicture* const picture,
+                           int left, int top, int width, int height) {
+  int j;
+  for (j = top; j < top + height; ++j) {
+    uint32_t* const dst = picture->argb + j * picture->argb_stride;
+    int i;
+    for (i = left; i < left + width; ++i) {
+      dst[i] = GIF_TRANSPARENT_COLOR;
+    }
+  }
+}
+
+void GIFClearPic(WebPPicture* const pic, const GIFFrameRect* const rect) {
+  if (rect != NULL) {
+    ClearRectangle(pic, rect->x_offset, rect->y_offset,
+                   rect->width, rect->height);
+  } else {
+    ClearRectangle(pic, 0, 0, pic->width, pic->height);
+  }
+}
+
+void GIFCopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
+  WebPCopyPixels(src, dst);
+}
+
+void GIFDisposeFrame(GIFDisposeMethod dispose, const GIFFrameRect* const rect,
+                     const WebPPicture* const prev_canvas,
+                     WebPPicture* const curr_canvas) {
+  assert(rect != NULL);
+  if (dispose == GIF_DISPOSE_BACKGROUND) {
+    GIFClearPic(curr_canvas, rect);
+  } else if (dispose == GIF_DISPOSE_RESTORE_PREVIOUS) {
+    const int src_stride = prev_canvas->argb_stride;
+    const uint32_t* const src =
+        prev_canvas->argb + rect->x_offset + rect->y_offset * src_stride;
+    const int dst_stride = curr_canvas->argb_stride;
+    uint32_t* const dst =
+        curr_canvas->argb + rect->x_offset + rect->y_offset * dst_stride;
+    assert(prev_canvas != NULL);
+    WebPCopyPlane((uint8_t*)src, 4 * src_stride, (uint8_t*)dst, 4 * dst_stride,
+                  4 * rect->width, rect->height);
+  }
+}
+
+void GIFBlendFrames(const WebPPicture* const src,
+                    const GIFFrameRect* const rect, WebPPicture* const dst) {
+  int j;
+  assert(src->width == dst->width && src->height == dst->height);
+  for (j = rect->y_offset; j < rect->y_offset + rect->height; ++j) {
+    int i;
+    for (i = rect->x_offset; i < rect->x_offset + rect->width; ++i) {
+      const uint32_t src_pixel = src->argb[j * src->argb_stride + i];
+      const int src_alpha = src_pixel >> 24;
+      if (src_alpha != 0) {
+        dst->argb[j * dst->argb_stride + i] = src_pixel;
+      }
+    }
+  }
+}
+
+void GIFDisplayError(const GifFileType* const gif, int gif_error) {
+  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
+#if LOCAL_GIF_PREREQ(4,2)
+#if LOCAL_GIF_PREREQ(5,0)
+  // Static string actually, hence the const char* cast.
+  const char* error_str = (const char*)GifErrorString(
+      (gif == NULL) ? gif_error : gif->Error);
+#else
+  const char* error_str = (const char*)GifErrorString();
+  (void)gif;
+#endif
+  if (error_str == NULL) error_str = "Unknown error";
+  fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
+#else
+  (void)gif;
+  fprintf(stderr, "GIFLib Error %d: ", gif_error);
+  PrintGifError();
+  fprintf(stderr, "\n");
+#endif
+}
+
+#else  // !WEBP_HAVE_GIF
+
+static void ErrorGIFNotAvailable() {
+  fprintf(stderr, "GIF support not compiled. Please install the libgif-dev "
+          "package before building.\n");
+}
+
+void GIFGetBackgroundColor(const struct ColorMapObject* const color_map,
+                           int bgcolor_index, int transparent_index,
+                           uint32_t* const bgcolor) {
+  (void)color_map;
+  (void)bgcolor_index;
+  (void)transparent_index;
+  (void)bgcolor;
+  ErrorGIFNotAvailable();
+}
+
+int GIFReadGraphicsExtension(const GifByteType* const data, int* const duration,
+                             GIFDisposeMethod* const dispose,
+                             int* const transparent_index) {
+  (void)data;
+  (void)duration;
+  (void)dispose;
+  (void)transparent_index;
+  ErrorGIFNotAvailable();
+  return 0;
+}
+
+int GIFReadFrame(struct GifFileType* const gif, int transparent_index,
+                 GIFFrameRect* const gif_rect,
+                 struct WebPPicture* const picture) {
+  (void)gif;
+  (void)transparent_index;
+  (void)gif_rect;
+  (void)picture;
+  ErrorGIFNotAvailable();
+  return 0;
+}
+
+int GIFReadLoopCount(struct GifFileType* const gif, GifByteType** const buf,
+                     int* const loop_count) {
+  (void)gif;
+  (void)buf;
+  (void)loop_count;
+  ErrorGIFNotAvailable();
+  return 0;
+}
+
+int GIFReadMetadata(struct GifFileType* const gif, GifByteType** const buf,
+                    struct WebPData* const metadata) {
+  (void)gif;
+  (void)buf;
+  (void)metadata;
+  ErrorGIFNotAvailable();
+  return 0;
+}
+
+void GIFDisposeFrame(GIFDisposeMethod dispose, const GIFFrameRect* const rect,
+                     const struct WebPPicture* const prev_canvas,
+                     struct WebPPicture* const curr_canvas) {
+  (void)dispose;
+  (void)rect;
+  (void)prev_canvas;
+  (void)curr_canvas;
+  ErrorGIFNotAvailable();
+}
+
+void GIFBlendFrames(const struct WebPPicture* const src,
+                    const GIFFrameRect* const rect,
+                    struct WebPPicture* const dst) {
+  (void)src;
+  (void)rect;
+  (void)dst;
+  ErrorGIFNotAvailable();
+}
+
+void GIFDisplayError(const struct GifFileType* const gif, int gif_error) {
+  (void)gif;
+  (void)gif_error;
+  ErrorGIFNotAvailable();
+}
+
+void GIFClearPic(struct WebPPicture* const pic,
+                 const GIFFrameRect* const rect) {
+  (void)pic;
+  (void)rect;
+  ErrorGIFNotAvailable();
+}
+
+void GIFCopyPixels(const struct WebPPicture* const src,
+                   struct WebPPicture* const dst) {
+  (void)src;
+  (void)dst;
+  ErrorGIFNotAvailable();
+}
+
+#endif  // WEBP_HAVE_GIF
+
+// -----------------------------------------------------------------------------
--- a/examples/gifdec.h
+++ b/examples/gifdec.h
@ -0,0 +1,116 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// GIF decode.
+
+#ifndef WEBP_EXAMPLES_GIFDEC_H_
+#define WEBP_EXAMPLES_GIFDEC_H_
+
+#include <stdio.h>
+#include "webp/types.h"
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#ifdef WEBP_HAVE_GIF
+#include <gif_lib.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
+#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR)
+# define LOCAL_GIF_VERSION ((GIFLIB_MAJOR << 8) | GIFLIB_MINOR)
+# define LOCAL_GIF_PREREQ(maj, min) \
+    (LOCAL_GIF_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GIF_VERSION 0
+# define LOCAL_GIF_PREREQ(maj, min) 0
+#endif
+
+#define GIF_INDEX_INVALID (-1)
+
+typedef enum GIFDisposeMethod {
+  GIF_DISPOSE_NONE,
+  GIF_DISPOSE_BACKGROUND,
+  GIF_DISPOSE_RESTORE_PREVIOUS
+} GIFDisposeMethod;
+
+typedef struct {
+  int x_offset, y_offset, width, height;
+} GIFFrameRect;
+
+struct WebPData;
+struct WebPPicture;
+
+#ifndef WEBP_HAVE_GIF
+struct ColorMapObject;
+struct GifFileType;
+typedef unsigned char GifByteType;
+#endif
+
+// Given the index of background color and transparent color, returns the
+// corresponding background color (in BGRA format) in 'bgcolor'.
+void GIFGetBackgroundColor(const struct ColorMapObject* const color_map,
+                           int bgcolor_index, int transparent_index,
+                           uint32_t* const bgcolor);
+
+// Parses the given graphics extension data to get frame duration (in 1ms
+// units), dispose method and transparent color index.
+// Returns true on success.
+int GIFReadGraphicsExtension(const GifByteType* const buf, int* const duration,
+                             GIFDisposeMethod* const dispose,
+                             int* const transparent_index);
+
+// Reads the next GIF frame from 'gif' into 'picture'. Also, returns the GIF
+// frame dimensions and offsets in 'rect'.
+// Returns true on success.
+int GIFReadFrame(struct GifFileType* const gif, int transparent_index,
+                 GIFFrameRect* const gif_rect,
+                 struct WebPPicture* const picture);
+
+// Parses loop count from the given Netscape extension data.
+int GIFReadLoopCount(struct GifFileType* const gif, GifByteType** const buf,
+                     int* const loop_count);
+
+// Parses the given ICC or XMP extension data and stores it into 'metadata'.
+// Returns true on success.
+int GIFReadMetadata(struct GifFileType* const gif, GifByteType** const buf,
+                    struct WebPData* const metadata);
+
+// Dispose the pixels within 'rect' of 'curr_canvas' based on 'dispose' method
+// and 'prev_canvas'.
+void GIFDisposeFrame(GIFDisposeMethod dispose, const GIFFrameRect* const rect,
+                     const struct WebPPicture* const prev_canvas,
+                     struct WebPPicture* const curr_canvas);
+
+// Given 'src' picture and its frame rectangle 'rect', blend it into 'dst'.
+void GIFBlendFrames(const struct WebPPicture* const src,
+                    const GIFFrameRect* const rect,
+                    struct WebPPicture* const dst);
+
+// Prints an error string based on 'gif_error'.
+void GIFDisplayError(const struct GifFileType* const gif, int gif_error);
+
+// In the given 'pic', clear the pixels in 'rect' to transparent color.
+void GIFClearPic(struct WebPPicture* const pic, const GIFFrameRect* const rect);
+
+// Copy pixels from 'src' to 'dst' honoring strides. 'src' and 'dst' are assumed
+// to be already allocated.
+void GIFCopyPixels(const struct WebPPicture* const src,
+                   struct WebPPicture* const dst);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_GIFDEC_H_
--- a/examples/jpegdec.c
+++ b/examples/jpegdec.c
@ -19,11 +19,13 @@

 #ifdef WEBP_HAVE_JPEG
 #include <jpeglib.h>
+#include <jerror.h>
 #include <setjmp.h>
 #include <stdlib.h>
 #include <string.h>

 #include "webp/encode.h"
+#include "./example_util.h"
 #include "./metadata.h"

 // -----------------------------------------------------------------------------
@ -208,13 +210,63 @@ static void my_error_exit(j_common_ptr dinfo) {
  longjmp(myerr->setjmp_buffer, 1);
 }

-int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
-  int ok = 0;
+typedef struct {
+  struct jpeg_source_mgr pub;
+  const uint8_t* data;
+  size_t data_size;
+} JPEGReadContext;
+
+static void ContextInit(j_decompress_ptr cinfo) {
+  JPEGReadContext* const ctx = (JPEGReadContext*)cinfo->src;
+  ctx->pub.next_input_byte = ctx->data;
+  ctx->pub.bytes_in_buffer = ctx->data_size;
+}
+
+static int ContextFill(j_decompress_ptr cinfo) {
+  // we shouldn't get here.
+  ERREXIT(cinfo, JERR_FILE_READ);
+  return 0;
+}
+
+static void ContextSkip(j_decompress_ptr cinfo, long jump_size) {
+  JPEGReadContext* const ctx = (JPEGReadContext*)cinfo->src;
+  size_t jump = (size_t)jump_size;
+  if (jump > ctx->pub.bytes_in_buffer) {  // Don't overflow the buffer.
+    jump = ctx->pub.bytes_in_buffer;
+  }
+  ctx->pub.bytes_in_buffer -= jump;
+  ctx->pub.next_input_byte += jump;
+}
+
+static void ContextTerm(j_decompress_ptr cinfo) {
+  (void)cinfo;
+}
+
+static void ContextSetup(volatile struct jpeg_decompress_struct* const cinfo,
+                         JPEGReadContext* const ctx) {
+  cinfo->src = (struct jpeg_source_mgr*)ctx;
+  ctx->pub.init_source = ContextInit;
+  ctx->pub.fill_input_buffer = ContextFill;
+  ctx->pub.skip_input_data = ContextSkip;
+  ctx->pub.resync_to_restart = jpeg_resync_to_restart;
+  ctx->pub.term_source = ContextTerm;
+  ctx->pub.bytes_in_buffer = 0;
+  ctx->pub.next_input_byte = NULL;
+}
+
+int ReadJPEG(const uint8_t* const data, size_t data_size,
+             WebPPicture* const pic, Metadata* const metadata) {
+  volatile int ok = 0;
  int stride, width, height;
  volatile struct jpeg_decompress_struct dinfo;
  struct my_error_mgr jerr;
  uint8_t* volatile rgb = NULL;
  JSAMPROW buffer[1];
+  JPEGReadContext ctx;
+
+  memset(&ctx, 0, sizeof(ctx));
+  ctx.data = data;
+  ctx.data_size = data_size;

  memset((j_decompress_ptr)&dinfo, 0, sizeof(dinfo));   // for setjmp sanity
  dinfo.err = jpeg_std_error(&jerr.pub);
@ -228,7 +280,7 @@ int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
  }

  jpeg_create_decompress((j_decompress_ptr)&dinfo);
-  jpeg_stdio_src((j_decompress_ptr)&dinfo, in_file);
+  ContextSetup(&dinfo, &ctx);
  if (metadata != NULL) SaveMetadataMarkers((j_decompress_ptr)&dinfo);
  jpeg_read_header((j_decompress_ptr)&dinfo, TRUE);

@ -272,7 +324,6 @@ int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
  // WebP conversion.
  pic->width = width;
  pic->height = height;
-  pic->use_argb = 1;      // store raw RGB samples
  ok = WebPPictureImportRGB(pic, rgb, stride);
  if (!ok) goto Error;

@ -281,9 +332,11 @@ int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
  return ok;
 }
 #else  // !WEBP_HAVE_JPEG
-int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
+int ReadJPEG(const uint8_t* const data, size_t data_size,
+             struct WebPPicture* const pic,
             struct Metadata* const metadata) {
-  (void)in_file;
+  (void)data;
+  (void)data_size;
  (void)pic;
  (void)metadata;
  fprintf(stderr, "JPEG support not compiled. Please install the libjpeg "
--- a/examples/jpegdec.h
+++ b/examples/jpegdec.h
@ -22,11 +22,11 @@ extern "C" {
 struct Metadata;
 struct WebPPicture;

-// Reads a JPEG from 'in_file', returning the decoded output in 'pic'.
-// The output is RGB.
+// Reads a JPEG from 'data', returning the decoded output in 'pic'.
+// The output is RGB or YUV depending on pic->use_argb value.
 // Returns true on success.
-int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
-             struct Metadata* const metadata);
+int ReadJPEG(const uint8_t* const data, size_t data_size,
+             struct WebPPicture* const pic, struct Metadata* const metadata);

 #ifdef __cplusplus
 }    // extern "C"
--- a/examples/pngdec.c
+++ b/examples/pngdec.c
@ -18,12 +18,14 @@
 #include <stdio.h>

 #ifdef WEBP_HAVE_PNG
+#include <assert.h>
 #include <png.h>
 #include <setjmp.h>   // note: this must be included *after* png.h
 #include <stdlib.h>
 #include <string.h>

 #include "webp/encode.h"
+#include "./example_util.h"
 #include "./metadata.h"

 static void PNGAPI error_function(png_structp png, png_const_charp error) {
@ -131,8 +133,8 @@ static int ExtractMetadataFromPNG(png_structp png,
  for (p = 0; p < 2; ++p)  {
    png_infop const info = (p == 0) ? head_info : end_info;
    png_textp text = NULL;
-    const int num = png_get_text(png, info, &text, NULL);
-    int i;
+    const png_uint_32 num = png_get_text(png, info, &text, NULL);
+    png_uint_32 i;
    // Look for EXIF / XMP metadata.
    for (i = 0; i < num; ++i, ++text) {
      int j;
@ -188,24 +190,40 @@ static int ExtractMetadataFromPNG(png_structp png,
  return 1;
 }

-int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
-            Metadata* const metadata) {
-  volatile png_structp png;
+typedef struct {
+  const uint8_t* data;
+  size_t data_size;
+  png_size_t offset;
+} PNGReadContext;
+
+static void ReadFunc(png_structp png_ptr, png_bytep data, png_size_t length) {
+  PNGReadContext* const ctx = (PNGReadContext*)png_get_io_ptr(png_ptr);
+  assert(ctx->offset + length <= ctx->data_size);
+  memcpy(data, ctx->data + ctx->offset, length);
+  ctx->offset += length;
+}
+
+int ReadPNG(const uint8_t* const data, size_t data_size,
+            struct WebPPicture* const pic,
+            int keep_alpha, struct Metadata* const metadata) {
+  volatile png_structp png = NULL;
  volatile png_infop info = NULL;
  volatile png_infop end_info = NULL;
+  PNGReadContext context = { NULL, 0, 0 };
  int color_type, bit_depth, interlaced;
  int has_alpha;
  int num_passes;
  int p;
-  int ok = 0;
+  volatile int ok = 0;
  png_uint_32 width, height, y;
-  int stride;
+  png_uint_32 stride;
  uint8_t* volatile rgb = NULL;

+  context.data = data;
+  context.data_size = data_size;
+
  png = png_create_read_struct(PNG_LIBPNG_VER_STRING, 0, 0, 0);
-  if (png == NULL) {
-    goto End;
-  }
+  if (png == NULL) goto End;

  png_set_error_fn(png, 0, error_function, NULL);
  if (setjmp(png_jmpbuf(png))) {
@ -219,7 +237,7 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
  end_info = png_create_info_struct(png);
  if (end_info == NULL) goto Error;

-  png_init_io(png, in_file);
+  png_set_read_fn(png, &context, ReadFunc);
  png_read_info(png, info);
  if (!png_get_IHDR(png, info,
                    &width, &height, &bit_depth, &color_type, &interlaced,
@ -268,11 +286,10 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
    goto Error;
  }

-  pic->width = width;
-  pic->height = height;
-  pic->use_argb = 1;
-  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
-                 : WebPPictureImportRGB(pic, rgb, stride);
+  pic->width = (int)width;
+  pic->height = (int)height;
+  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, (int)stride)
+                 : WebPPictureImportRGB(pic, rgb, (int)stride);

  if (!ok) {
    goto Error;
@ -287,9 +304,11 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
  return ok;
 }
 #else  // !WEBP_HAVE_PNG
-int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
-            struct Metadata* const metadata) {
-  (void)in_file;
+int ReadPNG(const uint8_t* const data, size_t data_size,
+            struct WebPPicture* const pic,
+            int keep_alpha, struct Metadata* const metadata) {
+  (void)data;
+  (void)data_size;
  (void)pic;
  (void)keep_alpha;
  (void)metadata;
--- a/examples/pngdec.h
+++ b/examples/pngdec.h
@ -12,7 +12,7 @@
 #ifndef WEBP_EXAMPLES_PNGDEC_H_
 #define WEBP_EXAMPLES_PNGDEC_H_

-#include <stdio.h>
+#include "webp/types.h"

 #ifdef __cplusplus
 extern "C" {
@ -21,12 +21,14 @@ extern "C" {
 struct Metadata;
 struct WebPPicture;

-// Reads a PNG from 'in_file', returning the decoded output in 'pic'.
+// Reads a PNG from 'data', returning the decoded output in 'pic'.
+// Output is RGBA or YUVA, depending on pic->use_argb value.
 // If 'keep_alpha' is true and the PNG has an alpha channel, the output is RGBA
-// otherwise it will be RGB.
+// or YUVA. Otherwise, alpha channel is dropped and output is RGB or YUV.
 // Returns true on success.
-int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
-            struct Metadata* const metadata);
+int ReadPNG(const uint8_t* const data, size_t data_size,
+            struct WebPPicture* const pic,
+            int keep_alpha, struct Metadata* const metadata);

 #ifdef __cplusplus
 }    // extern "C"
--- a/examples/tiffdec.c
+++ b/examples/tiffdec.c
@ -16,6 +16,7 @@
 #endif

 #include <stdio.h>
+#include <string.h>

 #ifdef WEBP_HAVE_TIFF
 #include <tiffio.h>
@ -63,17 +64,71 @@ static int ExtractMetadataFromTIFF(TIFF* const tif, Metadata* const metadata) {
  return 1;
 }

-int ReadTIFF(const char* const filename,
+// Ad-hoc structure to supply read-from-memory functionalities.
+typedef struct {
+  const uint8_t* data;
+  toff_t size;
+  toff_t pos;
+} MyData;
+
+static int MyClose(thandle_t opaque) {
+  (void)opaque;
+  return 0;
+}
+
+static toff_t MySize(thandle_t opaque) {
+  const MyData* const my_data = (MyData*)opaque;
+  return my_data->size;
+}
+
+static toff_t MySeek(thandle_t opaque, toff_t offset, int whence) {
+  MyData* const my_data = (MyData*)opaque;
+  offset += (whence == SEEK_CUR) ? my_data->pos
+          : (whence == SEEK_SET) ? 0
+          : my_data->size;
+  if (offset > my_data->size) return (toff_t)-1;
+  my_data->pos = offset;
+  return offset;
+}
+
+static int MyMapFile(thandle_t opaque, void** base, toff_t* size) {
+  (void)opaque;
+  (void)base;
+  (void)size;
+  return 0;
+}
+static void MyUnmapFile(thandle_t opaque, void* base, toff_t size) {
+  (void)opaque;
+  (void)base;
+  (void)size;
+}
+
+static tsize_t MyRead(thandle_t opaque, void* dst, tsize_t size) {
+  MyData* const my_data = (MyData*)opaque;
+  if (my_data->pos + size > my_data->size) {
+    size = my_data->size - my_data->pos;
+  }
+  if (size > 0) {
+    memcpy(dst, my_data->data + my_data->pos, size);
+    my_data->pos += size;
+  }
+  return size;
+}
+
+int ReadTIFF(const uint8_t* const data, size_t data_size,
             WebPPicture* const pic, int keep_alpha,
             Metadata* const metadata) {
-  TIFF* const tif = TIFFOpen(filename, "r");
+  MyData my_data = { data, (toff_t)data_size, 0 };
+  TIFF* const tif = TIFFClientOpen("Memory", "r", &my_data,
+                                   MyRead, MyRead, MySeek, MyClose,
+                                   MySize, MyMapFile, MyUnmapFile);
  uint32 width, height;
  uint32* raster;
  int ok = 0;
  tdir_t dircount;

  if (tif == NULL) {
-    fprintf(stderr, "Error! Cannot open TIFF file '%s'\n", filename);
+    fprintf(stderr, "Error! Cannot parse TIFF file\n");
    return 0;
  }

@ -87,7 +142,7 @@ int ReadTIFF(const char* const filename,
  if (!(TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width) &&
        TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height))) {
    fprintf(stderr, "Error! Cannot retrieve TIFF image dimensions.\n");
-    return 0;
+    goto End;
  }
  raster = (uint32*)_TIFFmalloc(width * height * sizeof(*raster));
  if (raster != NULL) {
@ -100,7 +155,6 @@ int ReadTIFF(const char* const filename,
 #ifdef WORDS_BIGENDIAN
      TIFFSwabArrayOfLong(raster, width * height);
 #endif
-      pic->use_argb = 1;
      ok = keep_alpha
         ? WebPPictureImportRGBA(pic, (const uint8_t*)raster, stride)
         : WebPPictureImportRGBX(pic, (const uint8_t*)raster, stride);
@ -120,15 +174,16 @@ int ReadTIFF(const char* const filename,
      }
    }
  }
-
+ End:
  TIFFClose(tif);
  return ok;
 }
 #else  // !WEBP_HAVE_TIFF
-int ReadTIFF(const char* const filename,
+int ReadTIFF(const uint8_t* const data, size_t data_size,
             struct WebPPicture* const pic, int keep_alpha,
             struct Metadata* const metadata) {
-  (void)filename;
+  (void)data;
+  (void)data_size;
  (void)pic;
  (void)keep_alpha;
  (void)metadata;
--- a/examples/tiffdec.h
+++ b/examples/tiffdec.h
@ -12,6 +12,8 @@
 #ifndef WEBP_EXAMPLES_TIFFDEC_H_
 #define WEBP_EXAMPLES_TIFFDEC_H_

+#include "webp/types.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -19,11 +21,12 @@ extern "C" {
 struct Metadata;
 struct WebPPicture;

-// Reads a TIFF from 'filename', returning the decoded output in 'pic'.
+// Reads a TIFF from 'data', returning the decoded output in 'pic'.
+// Output is RGBA or YUVA, depending on pic->use_argb value.
 // If 'keep_alpha' is true and the TIFF has an alpha channel, the output is RGBA
-// otherwise it will be RGB.
+// or YUVA. Otherwise, alpha channel is dropped and output is RGB or YUV.
 // Returns true on success.
-int ReadTIFF(const char* const filename,
+int ReadTIFF(const uint8_t* const data, size_t data_size,
             struct WebPPicture* const pic, int keep_alpha,
             struct Metadata* const metadata);

--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -38,7 +38,7 @@

 #include "./example_util.h"

-#ifdef _MSC_VER
+#if defined(_MSC_VER) && _MSC_VER < 1900
 #define snprintf _snprintf
 #endif

@ -308,19 +308,24 @@ static void HandleDisplay(void) {
    //              they will be incorrect if the window is resized.
    // glScissor() takes window coordinates (0,0 at bottom left).
    int window_x, window_y;
+    int frame_w, frame_h;
    if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
      // Clear the previous frame rectangle.
      window_x = prev->x_offset;
      window_y = kParams.canvas_height - prev->y_offset - prev->height;
+      frame_w = prev->width;
+      frame_h = prev->height;
    } else {  // curr->blend_method == WEBP_MUX_NO_BLEND.
      // We simulate no-blending behavior by first clearing the current frame
      // rectangle (to a checker-board) and then alpha-blending against it.
      window_x = curr->x_offset;
      window_y = kParams.canvas_height - curr->y_offset - curr->height;
+      frame_w = curr->width;
+      frame_h = curr->height;
    }
    glEnable(GL_SCISSOR_TEST);
    // Only update the requested area, not the whole canvas.
-    glScissor(window_x, window_y, prev->width, prev->height);
+    glScissor(window_x, window_y, frame_w, frame_h);

    glClear(GL_COLOR_BUFFER_BIT);  // use clear color
    DrawCheckerBoard();
@ -387,9 +392,7 @@ static void Help(void) {
         "  -nofancy ..... don't use the fancy YUV420 upscaler\n"
         "  -nofilter .... disable in-loop filtering\n"
         "  -dither <int>  dithering strength (0..100), default=50\n"
-#if WEBP_DECODER_ABI_VERSION > 0x0204
         "  -noalphadither disable alpha plane dithering\n"
-#endif
         "  -mt .......... use multi-threading\n"
         "  -info ........ print info\n"
         "  -h     ....... this help message\n"
@ -411,9 +414,7 @@ int main(int argc, char *argv[]) {
    return -1;
  }
  config->options.dithering_strength = 50;
-#if WEBP_DECODER_ABI_VERSION > 0x0204
  config->options.alpha_dithering_strength = 100;
-#endif
  kParams.use_color_profile = 1;

  for (c = 1; c < argc; ++c) {
@ -427,10 +428,8 @@ int main(int argc, char *argv[]) {
      config->options.no_fancy_upsampling = 1;
    } else if (!strcmp(argv[c], "-nofilter")) {
      config->options.bypass_filtering = 1;
-#if WEBP_DECODER_ABI_VERSION > 0x0204
    } else if (!strcmp(argv[c], "-noalphadither")) {
      config->options.alpha_dithering_strength = 0;
-#endif
    } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) {
      config->options.dithering_strength =
          ExUtilGetInt(argv[++c], 0, &parse_error);
@ -527,6 +526,12 @@ int main(int argc, char *argv[]) {
  WebPDemuxGetFrame(kParams.dmux, 0, curr);
  if (kParams.loop_count) ++kParams.loop_count;

+#if defined(__unix__) || defined(__CYGWIN__)
+  // Work around GLUT compositor bug.
+  // https://bugs.launchpad.net/ubuntu/+source/freeglut/+bug/369891
+  setenv("XLIB_SKIP_ARGB_VISUALS", "1", 1);
+#endif
+
  // Start display (and timer)
  glutInit(&argc, argv);
 #ifdef FREEGLUT
--- a/examples/webpdec.c
+++ b/examples/webpdec.c
@ -19,11 +19,10 @@
 #include "./example_util.h"
 #include "./metadata.h"

-int ReadWebP(const char* const in_file, WebPPicture* const pic,
+int ReadWebP(const uint8_t* const data, size_t data_size,
+             WebPPicture* const pic,
             int keep_alpha, Metadata* const metadata) {
  int ok = 0;
-  size_t data_size = 0;
-  const uint8_t* data = NULL;
  VP8StatusCode status = VP8_STATUS_OK;
  WebPDecoderConfig config;
  WebPDecBuffer* const output_buffer = &config.output;
@ -39,8 +38,15 @@ int ReadWebP(const char* const in_file, WebPPicture* const pic,
    return 0;
  }

-  if (ExUtilLoadWebP(in_file, &data, &data_size, bitstream)) {
+  status = WebPGetFeatures(data, data_size, bitstream);
+  if (status != VP8_STATUS_OK) {
+    ExUtilPrintWebPError("input data", status);
+    return 0;
+  }
+  {
    const int has_alpha = keep_alpha && bitstream->has_alpha;
+    // TODO(skal): use MODE_YUV(A), depending on the expected
+    // input pic->use_argb. This would save some conversion steps.
    output_buffer->colorspace = has_alpha ? MODE_RGBA : MODE_RGB;

    status = ExUtilDecodeWebP(data, data_size, 0, &config);
@ -49,17 +55,15 @@ int ReadWebP(const char* const in_file, WebPPicture* const pic,
      const int stride = output_buffer->u.RGBA.stride;
      pic->width = output_buffer->width;
      pic->height = output_buffer->height;
-      pic->use_argb = 1;
      ok = has_alpha ? WebPPictureImportRGBA(pic, rgba, stride)
                     : WebPPictureImportRGB(pic, rgba, stride);
    }
  }

  if (status != VP8_STATUS_OK) {
-    ExUtilPrintWebPError(in_file, status);
+    ExUtilPrintWebPError("input data", status);
  }

-  free((void*)data);
  WebPFreeDecBuffer(output_buffer);
  return ok;
 }
--- a/examples/webpdec.h
+++ b/examples/webpdec.h
@ -12,6 +12,8 @@
 #ifndef WEBP_EXAMPLES_WEBPDEC_H_
 #define WEBP_EXAMPLES_WEBPDEC_H_

+#include "webp/types.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -20,10 +22,12 @@ struct Metadata;
 struct WebPPicture;

 // Reads a WebP from 'in_file', returning the decoded output in 'pic'.
-// If 'keep_alpha' is true and the WebP has an alpha channel, the output is
-// RGBA otherwise it will be RGB.
+// Output is RGBA or YUVA, depending on pic->use_argb value.
+// If 'keep_alpha' is true and the WebP has an alpha channel, the output is RGBA
+// or YUVA. Otherwise, alpha channel is dropped and output is RGB or YUV.
 // Returns true on success.
-int ReadWebP(const char* const in_file, struct WebPPicture* const pic,
+int ReadWebP(const uint8_t* const data, size_t data_size,
+             struct WebPPicture* const pic,
             int keep_alpha, struct Metadata* const metadata);

 #ifdef __cplusplus
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
@ -182,13 +182,11 @@ static WebPMuxError DisplayInfo(const WebPMux* mux) {
  printf("Canvas size: %d x %d\n", width, height);

  err = WebPMuxGetFeatures(mux, &flag);
-#ifndef WEBP_EXPERIMENTAL_FEATURES
  if (flag & FRAGMENTS_FLAG) err = WEBP_MUX_INVALID_ARGUMENT;
-#endif
  RETURN_IF_ERROR("Failed to retrieve features\n");

  if (flag == 0) {
-    fprintf(stderr, "No features present.\n");
+    printf("No features present.\n");
    return err;
  }

@ -291,9 +289,6 @@ static void PrintHelp(void) {
  printf("Usage: webpmux -get GET_OPTIONS INPUT -o OUTPUT\n");
  printf("       webpmux -set SET_OPTIONS INPUT -o OUTPUT\n");
  printf("       webpmux -strip STRIP_OPTIONS INPUT -o OUTPUT\n");
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  printf("       webpmux -frgm FRAGMENT_OPTIONS [-frgm...] -o OUTPUT\n");
-#endif
  printf("       webpmux -frame FRAME_OPTIONS [-frame...] [-loop LOOP_COUNT]"
         "\n");
  printf("               [-bgcolor BACKGROUND_COLOR] -o OUTPUT\n");
@ -307,9 +302,6 @@ static void PrintHelp(void) {
  printf("   icc       get ICC profile\n");
  printf("   exif      get EXIF metadata\n");
  printf("   xmp       get XMP metadata\n");
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  printf("   frgm n    get nth fragment\n");
-#endif
  printf("   frame n   get nth frame\n");

  printf("\n");
@ -329,16 +321,6 @@ static void PrintHelp(void) {
  printf("   exif      strip EXIF metadata\n");
  printf("   xmp       strip XMP metadata\n");

-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  printf("\n");
-  printf("FRAGMENT_OPTIONS(i):\n");
-  printf(" Create fragmented image:\n");
-  printf("   file_i +xi+yi\n");
-  printf("   where:    'file_i' is the i'th fragment (WebP format),\n");
-  printf("             'xi','yi' specify the image offset for this fragment"
-         "\n");
-#endif
-
  printf("\n");
  printf("FRAME_OPTIONS(i):\n");
  printf(" Create animation:\n");
@ -650,24 +632,6 @@ static int ParseCommandLine(int argc, const char* argv[],
        arg->params_ = argv[i + 1];
        ++feature_arg_index;
        i += 2;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      } else if (!strcmp(argv[i], "-frgm")) {
-        CHECK_NUM_ARGS_LESS(3, ErrParse);
-        if (ACTION_IS_NIL || config->action_type_ == ACTION_SET) {
-          config->action_type_ = ACTION_SET;
-        } else {
-          ERROR_GOTO1("ERROR: Multiple actions specified.\n", ErrParse);
-        }
-        if (FEATURETYPE_IS_NIL || feature->type_ == FEATURE_FRGM) {
-          feature->type_ = FEATURE_FRGM;
-        } else {
-          ERROR_GOTO1("ERROR: Multiple features specified.\n", ErrParse);
-        }
-        arg->filename_ = argv[i + 1];
-        arg->params_ = argv[i + 2];
-        ++feature_arg_index;
-        i += 3;
-#endif
      } else if (!strcmp(argv[i], "-o")) {
        CHECK_NUM_ARGS_LESS(2, ErrParse);
        config->output_ = argv[i + 1];
@ -727,13 +691,8 @@ static int ParseCommandLine(int argc, const char* argv[],
        } else {
          ++i;
        }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      } else if ((!strcmp(argv[i], "frame") ||
-                  !strcmp(argv[i], "frgm")) &&
-#else
      } else if (!strcmp(argv[i], "frame") &&
-#endif
-                  (config->action_type_ == ACTION_GET)) {
+                 (config->action_type_ == ACTION_GET)) {
        CHECK_NUM_ARGS_LESS(2, ErrParse);
        feature->type_ = (!strcmp(argv[i], "frame")) ? FEATURE_ANMF :
            FEATURE_FRGM;
@ -831,7 +790,7 @@ static int GetFrameFragment(const WebPMux* mux,
                            const WebPMuxConfig* config, int is_frame) {
  WebPMuxError err = WEBP_MUX_OK;
  WebPMux* mux_single = NULL;
-  long num = 0;
+  int num = 0;
  int ok = 1;
  int parse_error = 0;
  const WebPChunkId id = is_frame ? WEBP_CHUNK_ANMF : WEBP_CHUNK_FRGM;
@ -847,7 +806,7 @@ static int GetFrameFragment(const WebPMux* mux,
  err = WebPMuxGetFrame(mux, num, &info);
  if (err == WEBP_MUX_OK && info.id != id) err = WEBP_MUX_NOT_FOUND;
  if (err != WEBP_MUX_OK) {
-    ERROR_GOTO3("ERROR (%s): Could not get frame %ld.\n",
+    ERROR_GOTO3("ERROR (%s): Could not get frame %d.\n",
                ErrorString(err), num, ErrGet);
  }

--- a/examples/wicdec.c
+++ b/examples/wicdec.c
@ -17,6 +17,7 @@

 #include <assert.h>
 #include <stdio.h>
+#include <string.h>

 #ifdef HAVE_WINCODEC_H
 #ifdef __MINGW32__
@ -26,11 +27,13 @@
 #define COBJMACROS
 #define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
                         // code with COBJMACROS.
+#include <ole2.h>  // CreateStreamOnHGlobal()
 #include <shlwapi.h>
 #include <windows.h>
 #include <wincodec.h>

 #include "webp/encode.h"
+#include "./example_util.h"
 #include "./metadata.h"

 #define IFS(fn)                                                     \
@ -73,10 +76,41 @@ WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppBGRA_,
 WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppRGBA_,
                 0xf5c7ad2d, 0x6a8d, 0x43dd,
                 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat64bppBGRA_,
+                 0x1562ff7c, 0xd352, 0x46f9,
+                 0x97, 0x9e, 0x42, 0x97, 0x6b, 0x79, 0x22, 0x46);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat64bppRGBA_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x16);

 static HRESULT OpenInputStream(const char* filename, IStream** stream) {
  HRESULT hr = S_OK;
-  IFS(SHCreateStreamOnFileA(filename, STGM_READ, stream));
+  if (!strcmp(filename, "-")) {
+    const uint8_t* data = NULL;
+    size_t data_size = 0;
+    const int ok = ExUtilReadFile(filename, &data, &data_size);
+    if (ok) {
+      HGLOBAL image = GlobalAlloc(GMEM_MOVEABLE, data_size);
+      if (image != NULL) {
+        void* const image_mem = GlobalLock(image);
+        if (image_mem != NULL) {
+          memcpy(image_mem, data, data_size);
+          GlobalUnlock(image);
+          IFS(CreateStreamOnHGlobal(image, TRUE, stream));
+        } else {
+          hr = E_FAIL;
+        }
+      } else {
+        hr = E_OUTOFMEMORY;
+      }
+      free((void*)data);
+    } else {
+      hr = E_FAIL;
+    }
+  } else {
+    IFS(SHCreateStreamOnFileA(filename, STGM_READ, stream));
+  }
+
  if (FAILED(hr)) {
    fprintf(stderr, "Error opening input file %s (%08lx)\n", filename, hr);
  }
@ -196,7 +230,11 @@ static int HasAlpha(IWICImagingFactory* const factory,
    has_alpha = IsEqualGUID(MAKE_REFGUID(pixel_format),
                            MAKE_REFGUID(GUID_WICPixelFormat32bppRGBA_)) ||
                IsEqualGUID(MAKE_REFGUID(pixel_format),
-                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_));
+                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat64bppRGBA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat64bppBGRA_));
  }
  return has_alpha;
 }
@ -310,7 +348,7 @@ int ReadPictureWithWIC(const char* const filename,
    int ok;
    pic->width = width;
    pic->height = height;
-    pic->use_argb = 1;
+    pic->use_argb = 1;    // For WIC, we always force to argb
    ok = importer->import(pic, rgb, stride);
    if (!ok) hr = E_FAIL;
  }
--- a/examples/wicdec.h
+++ b/examples/wicdec.h
@ -21,7 +21,7 @@ struct WebPPicture;

 // Reads an image from 'filename', returning the decoded output in 'pic'.
 // If 'keep_alpha' is true and the image has an alpha channel, the output is
-// RGBA otherwise it will be RGB.
+// RGBA otherwise it will be RGB. pic->use_argb is always forced to true.
 // Returns true on success.
 int ReadPictureWithWIC(const char* const filename,
                       struct WebPPicture* const pic, int keep_alpha,
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -41,6 +41,8 @@ LIBLIST=''
 if [[ -z "${SDK}" ]]; then
  echo "iOS SDK not available"
  exit 1
+elif [[ ${SDK%%.*} -gt 8 ]]; then
+  EXTRA_CFLAGS="-fembed-bitcode"
 elif [[ ${SDK} < 6.0 ]]; then
  echo "You need iOS SDK version 6.0 or above"
  exit 1
@ -94,7 +96,7 @@ for PLATFORM in ${PLATFORMS}; do
  SDKROOT="${PLATFORMSROOT}/"
  SDKROOT+="${PLATFORM}.platform/Developer/SDKs/${PLATFORM}${SDK}.sdk/"
  CFLAGS="-arch ${ARCH2:-${ARCH}} -pipe -isysroot ${SDKROOT} -O3 -DNDEBUG"
-  CFLAGS+=" -miphoneos-version-min=6.0"
+  CFLAGS+=" -miphoneos-version-min=6.0 ${EXTRA_CFLAGS}"

  set -x
  export PATH="${DEVROOT}/usr/bin:${OLDPATH}"
--- a/makefile.unix
+++ b/makefile.unix
@ -2,8 +2,8 @@
 # system, for simple local building of the libraries and tools.
 # It will not install the libraries system-wide, but just create the 'cwebp'
 # and 'dwebp' tools in the examples/ directory, along with the static
-# libraries 'src/libwebp.a', 'src/libwebpdecoder.a', 'src/mux/libwebpmux.a' and
-# 'src/demux/libwebpdemux.a'.
+# libraries 'src/libwebp.a', 'src/libwebpdecoder.a', 'src/mux/libwebpmux.a',
+# 'src/demux/libwebpdemux.a' and 'src/libwebpextras.a'.
 #
 # To build the library and examples, use:
 #    make -f makefile.unix
@ -61,6 +61,9 @@ endif
 EXTRA_FLAGS += -DWEBP_USE_THREAD
 EXTRA_LIBS += -lpthread

+# Control symbol visibility. Comment out if your compiler doesn't support it.
+EXTRA_FLAGS += -fvisibility=hidden
+
 # Extra flags to emulate C89 strictness with the full ANSI
 EXTRA_FLAGS += -Wextra -Wold-style-definition
 EXTRA_FLAGS += -Wmissing-prototypes
@ -68,9 +71,14 @@ EXTRA_FLAGS += -Wmissing-declarations
 EXTRA_FLAGS += -Wdeclaration-after-statement
 EXTRA_FLAGS += -Wshadow
 EXTRA_FLAGS += -Wformat-security -Wformat-nonliteral
-
 # EXTRA_FLAGS += -Wvla

+# SSE4.1-specific flags:
+ifeq ($(HAVE_SSE41), 1)
+EXTRA_FLAGS += -DWEBP_HAVE_SSE41
+src/dsp/%_sse41.o: EXTRA_FLAGS += -msse4.1
+endif
+
 # AVX2-specific flags:
 ifeq ($(HAVE_AVX2), 1)
 EXTRA_FLAGS += -DWEBP_HAVE_AVX2
@ -85,14 +93,17 @@ endif

 AR = ar
 ARFLAGS = r
-CC = gcc
 CPPFLAGS = -Isrc/ -Wall
 CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
+CC = gcc
 INSTALL = install
 GROFF = /usr/bin/groff
 COL = /usr/bin/col
 LDFLAGS = $(EXTRA_LIBS) $(EXTRA_FLAGS) -lm

+ANIM_UTIL_OBJS = \
+    examples/anim_util.o \
+
 DEC_OBJS = \
    src/dec/alpha.o \
    src/dec/buffer.o \
@ -106,34 +117,64 @@ DEC_OBJS = \
    src/dec/webp.o \

 DEMUX_OBJS = \
+    src/demux/anim_decode.o \
    src/demux/demux.o \

 DSP_DEC_OBJS = \
    src/dsp/alpha_processing.o \
+    src/dsp/alpha_processing_mips_dsp_r2.o \
    src/dsp/alpha_processing_sse2.o \
+    src/dsp/alpha_processing_sse41.o \
    src/dsp/cpu.o \
    src/dsp/dec.o \
    src/dsp/dec_clip_tables.o \
    src/dsp/dec_mips32.o \
+    src/dsp/dec_mips_dsp_r2.o \
    src/dsp/dec_neon.o \
    src/dsp/dec_sse2.o \
+    src/dsp/dec_sse41.o \
+    src/dsp/filters.o \
+    src/dsp/filters_mips_dsp_r2.o \
+    src/dsp/filters_sse2.o \
    src/dsp/lossless.o \
-    src/dsp/lossless_mips32.o \
+    src/dsp/lossless_mips_dsp_r2.o \
    src/dsp/lossless_neon.o \
    src/dsp/lossless_sse2.o \
+    src/dsp/rescaler.o \
+    src/dsp/rescaler_mips32.o \
+    src/dsp/rescaler_mips_dsp_r2.o \
+    src/dsp/rescaler_neon.o \
+    src/dsp/rescaler_sse2.o \
    src/dsp/upsampling.o \
+    src/dsp/upsampling_mips_dsp_r2.o \
    src/dsp/upsampling_neon.o \
    src/dsp/upsampling_sse2.o \
    src/dsp/yuv.o \
    src/dsp/yuv_mips32.o \
+    src/dsp/yuv_mips_dsp_r2.o \
    src/dsp/yuv_sse2.o \

 DSP_ENC_OBJS = \
+    src/dsp/argb.o \
+    src/dsp/argb_mips_dsp_r2.o \
+    src/dsp/argb_sse2.o \
+    src/dsp/cost.o \
+    src/dsp/cost_mips32.o \
+    src/dsp/cost_mips_dsp_r2.o \
+    src/dsp/cost_sse2.o \
    src/dsp/enc.o \
    src/dsp/enc_avx2.o \
    src/dsp/enc_mips32.o \
+    src/dsp/enc_mips_dsp_r2.o \
    src/dsp/enc_neon.o \
    src/dsp/enc_sse2.o \
+    src/dsp/enc_sse41.o \
+    src/dsp/lossless_enc.o \
+    src/dsp/lossless_enc_mips32.o \
+    src/dsp/lossless_enc_mips_dsp_r2.o \
+    src/dsp/lossless_enc_neon.o \
+    src/dsp/lossless_enc_sse2.o \
+    src/dsp/lossless_enc_sse41.o \

 ENC_OBJS = \
    src/enc/alpha.o \
@ -141,10 +182,12 @@ ENC_OBJS = \
    src/enc/backward_references.o \
    src/enc/config.o \
    src/enc/cost.o \
+    src/enc/delta_palettization.o \
    src/enc/filter.o \
    src/enc/frame.o \
    src/enc/histogram.o \
    src/enc/iterator.o \
+    src/enc/near_lossless.o \
    src/enc/picture.o \
    src/enc/picture_csp.o \
    src/enc/picture_psnr.o \
@ -167,10 +210,11 @@ EX_FORMAT_DEC_OBJS = \
 EX_UTIL_OBJS = \
    examples/example_util.o \

-GIF2WEBP_UTIL_OBJS = \
-    examples/gif2webp_util.o \
+GIFDEC_OBJS = \
+    examples/gifdec.o \

 MUX_OBJS = \
+    src/mux/anim_encode.o \
    src/mux/muxedit.o \
    src/mux/muxinternal.o \
    src/mux/muxread.o \
@ -191,11 +235,15 @@ UTILS_ENC_OBJS = \
    src/utils/huffman_encode.o \
    src/utils/quant_levels.o \

+EXTRA_OBJS = \
+    src/extras/extras.o \
+
 LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
 LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
               $(UTILS_ENC_OBJS)
 LIBWEBPMUX_OBJS = $(MUX_OBJS)
 LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS)
+LIBWEBPEXTRA_OBJS = $(EXTRA_OBJS)

 HDRS_INSTALLED = \
    src/webp/decode.h \
@ -207,24 +255,28 @@ HDRS_INSTALLED = \

 HDRS = \
    src/dec/alphai.h \
+    src/dec/common.h \
    src/dec/decode_vp8.h \
    src/dec/vp8i.h \
    src/dec/vp8li.h \
    src/dec/webpi.h \
    src/dsp/dsp.h \
    src/dsp/lossless.h \
+    src/dsp/mips_macro.h \
    src/dsp/neon.h \
    src/dsp/yuv.h \
-    src/dsp/yuv_tables_sse2.h \
    src/enc/backward_references.h \
    src/enc/cost.h \
+    src/enc/delta_palettization.h \
    src/enc/histogram.h \
    src/enc/vp8enci.h \
    src/enc/vp8li.h \
    src/mux/muxi.h \
    src/utils/bit_reader.h \
+    src/utils/bit_reader_inl.h \
    src/utils/bit_writer.h \
    src/utils/color_cache.h \
+    src/utils/endian_inl.h \
    src/utils/filters.h \
    src/utils/huffman.h \
    src/utils/huffman_encode.h \
@ -238,18 +290,21 @@ HDRS = \
    $(HDRS_INSTALLED) \

 OUT_LIBS = examples/libexample_util.a src/libwebpdecoder.a src/libwebp.a
+EXTRA_LIB = src/libwebpextras.a
 OUT_EXAMPLES = examples/cwebp examples/dwebp
-EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux
+EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux \
+                 examples/anim_diff

 OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
 ifeq ($(MAKECMDGOALS),clean)
  OUTPUT += $(EXTRA_EXAMPLES)
-  OUTPUT += src/demux/libwebpdemux.a src/mux/libwebpmux.a
-  OUTPUT += examples/libgif2webp_util.a
+  OUTPUT += src/demux/libwebpdemux.a src/mux/libwebpmux.a $(EXTRA_LIB)
+  OUTPUT += examples/libgifdec.a examples/libanim_util.a
 endif

 ex: $(OUT_EXAMPLES)
 all: ex $(EXTRA_EXAMPLES)
+extras: $(EXTRA_LIB)

 $(EX_FORMAT_DEC_OBJS): %.o: %.h

@ -264,27 +319,35 @@ src/utils/bit_writer.o: src/utils/endian_inl.h
 %.o: %.c $(HDRS)
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@

+examples/libanim_util.a: $(ANIM_UTIL_OBJS)
 examples/libexample_util.a: $(EX_UTIL_OBJS)
-examples/libgif2webp_util.a: $(GIF2WEBP_UTIL_OBJS)
+examples/libgifdec.a: $(GIFDEC_OBJS)
 src/libwebpdecoder.a: $(LIBWEBPDECODER_OBJS)
 src/libwebp.a: $(LIBWEBP_OBJS)
 src/mux/libwebpmux.a: $(LIBWEBPMUX_OBJS)
 src/demux/libwebpdemux.a: $(LIBWEBPDEMUX_OBJS)
+src/libwebpextras.a: $(LIBWEBPEXTRA_OBJS)

 %.a:
 	$(AR) $(ARFLAGS) $@ $^

+examples/anim_diff: examples/anim_diff.o $(ANIM_UTIL_OBJS) $(GIFDEC_OBJS)
 examples/cwebp: examples/cwebp.o $(EX_FORMAT_DEC_OBJS)
 examples/dwebp: examples/dwebp.o
-examples/gif2webp: examples/gif2webp.o
+examples/gif2webp: examples/gif2webp.o $(GIFDEC_OBJS)
 examples/vwebp: examples/vwebp.o
 examples/webpmux: examples/webpmux.o

+examples/anim_diff: examples/libanim_util.a examples/libgifdec.a
+examples/anim_diff: src/demux/libwebpdemux.a examples/libexample_util.a
+examples/anim_diff: src/libwebp.a
+examples/anim_diff: EXTRA_LIBS += $(GIF_LIBS)
+examples/anim_diff: EXTRA_FLAGS += -DWEBP_HAVE_GIF
 examples/cwebp: examples/libexample_util.a src/libwebp.a
 examples/cwebp: EXTRA_LIBS += $(CWEBP_LIBS)
 examples/dwebp: examples/libexample_util.a src/libwebpdecoder.a
 examples/dwebp: EXTRA_LIBS += $(DWEBP_LIBS)
-examples/gif2webp: examples/libexample_util.a examples/libgif2webp_util.a
+examples/gif2webp: examples/libexample_util.a examples/libgifdec.a
 examples/gif2webp: src/mux/libwebpmux.a src/libwebp.a
 examples/gif2webp: EXTRA_LIBS += $(GIF_LIBS)
 examples/gif2webp: EXTRA_FLAGS += -DWEBP_HAVE_GIF
@ -324,23 +387,10 @@ clean:
              src/demux/*.o src/demux/*~ \
              src/dsp/*.o src/dsp/*~ \
              src/enc/*.o src/enc/*~ \
+              src/extras/*.o src/extras/*~ \
              src/mux/*.o src/mux/*~ \
              src/utils/*.o src/utils/*~ \
              src/webp/*~ man/*~ doc/*~ swig/*~ \

-superclean: clean
-	$(RM) -r .git *.log *.cache *~
-	$(RM) -r .deps */.deps */*/.deps
-	$(RM) -r .libs */.libs */*/.libs
-	$(RM) */*.lo */*/*.lo
-	$(RM) */*.la */*/*.la
-	$(RM) Makefile */Makefile */*/Makefile
-	$(RM) Makefile.in */Makefile.in */*/Makefile.in
-	$(RM) config.log autom4te.cache libtool config.h stamp-h1
-	$(RM) aclocal.m4 compile
-	$(RM) config.guess config.h.in config.sub config.status
-	$(RM) configure depcomp install-sh ltmain.sh missing src/libwebp.pc
-	$(RM) m4/*
-
-.PHONY: all clean dist ex superclean
+.PHONY: all clean dist ex
 .SUFFIXES:
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "Oct 13, 2014"
+.TH CWEBP 1 "December 14, 2015"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@ -23,7 +23,7 @@ Using "\-" as output name will direct output to 'stdout'.
 .TP
 .BI \-\- " string
 Explicitly specify the input file. This option is useful if the input
-file starts with an '\-' for instance. This option must appear \fBlast\fP.
+file starts with a '\-' for instance. This option must appear \fBlast\fP.
 Any other options afterward will be ignored.
 .TP
 .B \-h, \-help
@ -35,6 +35,17 @@ A summary of all the possible options.
 .B \-version
 Print the version number (as major.minor.revision) and exit.
 .TP
+.B \-lossless
+Encode the image without any loss. For images with fully transparent area,
+the invisible pixel values (R/G/B or Y/U/V) will be preserved only if the
+\-exact option is used.
+.TP
+.BI \-near_lossless " int
+Use near\-lossless image preprocessing. This option adjusts pixel values
+to help compressibility, but has minimal impact on the visual quality.
+It triggers lossless compression mode automatically.
+Range is 0 (maximum preprocessing) to 100 (no preprocessing, the default).
+.TP
 .BI \-q " float
 Specify the compression factor for RGB channels between 0 and 100. The default
 is 75.
@ -42,46 +53,31 @@ is 75.
 In case of lossy compression (default), a small factor produces a smaller file
 with lower quality. Best quality is achieved by using a value of 100.
 .br
-In case of lossless compression (specified by the \-lossless option), a small
-factor enables faster compression speed, but produces a larger file. Maximum
-compression is achieved by using a value of 100.
-.\" TODO(jzern): restore post-v0.4.1
-.\" .TP
-.\" .BI \-z " int
-.\" Switch on \fBlossless\fP compression mode with the specified level between 0
-.\" and 9, with level 0 being the fastest, 9 being the slowest. Fast mode
-.\" produces larger file size than slower ones. A good default is \-z 6.
-.\" This option is actually a shortcut for some predefined settings for quality
-.\" and method. If options \-q  or \-m are subsequently used, they will invalidate
-.\" the effect of this \-z option.
+In case of lossless compression (specified by the \fB\-lossless\fP option), a
+small factor enables faster compression speed, but produces a larger file.
+Maximum compression is achieved by using a value of 100.
+.TP
+.BI \-z " int
+Switch on \fBlossless\fP compression mode with the specified level between 0
+and 9, with level 0 being the fastest, 9 being the slowest. Fast mode
+produces larger file size than slower ones. A good default is \fB\-z 6\fP.
+This option is actually a shortcut for some predefined settings for quality
+and method. If options \fB\-q\fP  or \fB\-m\fP are subsequently used, they will
+invalidate the effect of this option.
 .TP
 .BI \-alpha_q " int
 Specify the compression factor for alpha compression between 0 and 100.
 Lossless compression of alpha is achieved using a value of 100, while the lower
 values result in a lossy compression. The default is 100.
 .TP
-.BI \-f " int
-Specify the strength of the deblocking filter, between 0 (no filtering)
-and 100 (maximum filtering). A value of 0 will turn off any filtering.
-Higher value will increase the strength of the filtering process applied
-after decoding the picture. The higher the value the smoother the picture will
-appear. Typical values are usually in the range of 20 to 50.
-.TP
 .BI \-preset " string
-Specify a set of pre-defined parameters to suit a particular type of
+Specify a set of pre\-defined parameters to suit a particular type of
 source material. Possible values are:  \fBdefault\fP, \fBphoto\fP,
 \fBpicture\fP, \fBdrawing\fP, \fBicon\fP, \fBtext\fP. Since
 \fB\-preset\fP overwrites the other parameters' values (except the
 \fB\-q\fP one), this option should preferably appear first in the
 order of the arguments.
 .TP
-.BI \-sns " int
-Specify the amplitude of the spatial noise shaping. Spatial noise shaping
-(or \fBsns\fP for short) refers to a general collection of built-in algorithms
-used to decide which area of the picture should use relatively less bits,
-and where else to better transfer these bits. The possible range goes from
-0 (algorithm is off) to 100 (the maximal effect). The default value is 80.
-.TP
 .BI \-m " int
 Specify the compression method to use. This parameter controls the
 trade off between encoding speed and the compressed file size and quality.
@ -91,14 +87,18 @@ additional encoding possibilities and decide on the quality gain.
 Lower value can result in faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
-.B \-jpeg_like
-Change the internal parameter mapping to better match the expected size
-of JPEG compression. This flag will generally produce an output file of
-similar size to its JPEG equivalent (for the same \fB\-q\fP setting), but
-with less visual distortion.
+.BI \-resize " width height
+Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
+If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
+the value will be calculated preserving the aspect\-ratio.
+.TP
+.BI \-crop " x_position y_position width height
+Crop the source to a rectangle with top\-left corner at coordinates
+(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
+This cropping area must be fully contained within the source rectangle.
 .TP
 .B \-mt
-Use multi-threading for encoding, if possible. This option is only effective
+Use multi\-threading for encoding, if possible. This option is only effective
 when using lossy compression on a source with a transparency channel.
 .TP
 .B \-low_memory
@ -109,50 +109,11 @@ different in size and distortion. This flag is only effective for methods
 some side effects on the bitstream: it forces certain bitstream features
 like number of partitions (forced to 1). Note that a more detailed report
 of bitstream size is printed by \fBcwebp\fP when using this option.
-.TP
-.B \-af
-Turns auto-filter on. This algorithm will spend additional time optimizing
-the filtering strength to reach a well-balanced quality.

-.SH ADDITIONAL OPTIONS
-More advanced options are:
-.TP
-.BI \-sharpness " int
-Specify the sharpness of the filtering (if used).
-Range is 0 (sharpest) to 7 (least sharp). Default is 0.
-.TP
-.B \-strong
-Use strong filtering (if filtering is being used thanks to the
-\fB\-f\fP option). Strong filtering is on by default.
-.TP
-.B \-nostrong
-Disable strong filtering (if filtering is being used thanks to the
-\fB\-f\fP option) and use simple filtering instead.
-.TP
-.BI \-segments " int
-Change the number of partitions to use during the segmentation of the
-sns algorithm. Segments should be in range 1 to 4. Default value is 4.
-This option has no effect for methods 3 and up, unless \fB\-low_memory\fP
-is used.
-.TP
-.BI \-partition_limit " int
-Degrade quality by limiting the number of bits used by some macroblocks.
-Range is 0 (no degradation, the default) to 100 (full degradation).
-Useful values are usually around 30-70 for moderately large images.
-In the VP8 format, the so-called control partition has a limit of 512k and
-is used to store the following information: whether the macroblock is skipped,
-which segment it belongs to, whether it is coded as intra 4x4 or intra 16x16
-mode, and finally the prediction modes to use for each of the sub-blocks.
-For a very large image, 512k only leaves room to few bits per 16x16 macroblock.
-The absolute minimum is 4 bits per macroblock. Skip, segment, and mode
-information can use up almost all these 4 bits (although the case is unlikely),
-which is problematic for very large images. The partition_limit factor controls
-how frequently the most bit-costly mode (intra 4x4) will be used. This is
-useful in case the 512k limit is reached and the following message is displayed:
-\fIError code: 6 (PARTITION0_OVERFLOW: Partition #0 is too big to fit 512k)\fP.
-If using \fB-partition_limit\fP is not enough to meet the 512k constraint, one
-should use less segments in order to save more header bits per macroblock.
-See the \fB-segments\fP option.
+.SS LOSSY OPTIONS
+These options are only effective when doing lossy encoding (the default, with
+or without alpha).
+
 .TP
 .BI \-size " int
 Specify a target size (in bytes) to try and reach for the compressed output.
@ -166,79 +127,81 @@ close as possible to this target.
 .TP
 .BI \-pass " int
 Set a maximum number of passes to use during the dichotomy used by
-options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
+options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10, default is 1.
 .TP
-.BI \-resize " width height
-Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
-If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
-the value will be calculated preserving the aspect-ratio.
+.B \-af
+Turns auto\-filter on. This algorithm will spend additional time optimizing
+the filtering strength to reach a well\-balanced quality.
 .TP
-.BI \-crop " x_position y_position width height
-Crop the source to a rectangle with top-left corner at coordinates
-(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
-This cropping area must be fully contained within the source rectangle.
-.TP
-.BI \-s " width height
-Specify that the input file actually consists of raw Y'CbCr samples following
-the ITU-R BT.601 recommendation, in 4:2:0 linear format.
-The luma plane has size \fBwidth\fP x \fBheight\fP.
-.TP
-.BI \-map " int
-Output additional ASCII-map of encoding information. Possible map values
-range from 1 to 6. This is only meant to help debugging.
-.TP
-.BI \-pre " int
-Specify some pre-processing steps. Using a value of '2' will trigger
-quality-dependent pseudo-random dithering during RGBA->YUVA conversion
-(lossy compression only).
-.TP
-.BI \-alpha_filter " string
-Specify the predictive filtering method for the alpha plane. One of 'none',
-\&'fast' or 'best', in increasing complexity and slowness order. Default is
-\&'fast'. Internally, alpha filtering is performed using four possible
-predictions (none, horizontal, vertical, gradient). The 'best' mode will try
-each mode in turn and pick the one which gives the smaller size. The 'fast'
-mode will just try to form an a-priori guess without testing all modes.
-.TP
-.BI \-alpha_method " int
-Specify the algorithm used for alpha compression: 0 or 1. Algorithm 0 denotes
-no compression, 1 uses WebP lossless format for compression. The default is 1.
-.TP
-.B \-alpha_cleanup
-Modify unseen RGB values under fully transparent area, to help compressibility.
-The default is off.
-.TP
-.BI \-blend_alpha " int
-This option blends the alpha channel (if present) with the source using the
-background color specified in hexadecimal as 0xrrggbb. The alpha channel is
-afterward reset to the opaque value 255.
-.TP
-.B \-noalpha
-Using this option will discard the alpha channel.
-.TP
-.B \-lossless
-Encode the image without any loss.
-.TP
-.BI \-hint " string
-Specify the hint about input image type. Possible values are:
-\fBphoto\fP, \fBpicture\fP or \fBgraph\fP.
-.TP
-.BI \-metadata " string
-A comma separated list of metadata to copy from the input to the output if
-present.
-Valid values: \fBall\fP, \fBnone\fP, \fBexif\fP, \fBicc\fP, \fBxmp\fP.
-The default is \fBnone\fP.
+.B \-jpeg_like
+Change the internal parameter mapping to better match the expected size
+of JPEG compression. This flag will generally produce an output file of
+similar size to its JPEG equivalent (for the same \fB\-q\fP setting), but
+with less visual distortion.

-Note: each input format may not support all combinations.
 .TP
-.B \-noasm
-Disable all assembly optimizations.
+Advanced options:
+
+.TP
+.BI \-f " int
+Specify the strength of the deblocking filter, between 0 (no filtering)
+and 100 (maximum filtering). A value of 0 will turn off any filtering.
+Higher value will increase the strength of the filtering process applied
+after decoding the picture. The higher the value the smoother the picture will
+appear. Typical values are usually in the range of 20 to 50.
+.TP
+.BI \-sharpness " int
+Specify the sharpness of the filtering (if used).
+Range is 0 (sharpest) to 7 (least sharp). Default is 0.
+.TP
+.B \-strong
+Use strong filtering (if filtering is being used thanks to the
+\fB\-f\fP option). Strong filtering is on by default.
+.TP
+.B \-nostrong
+Disable strong filtering (if filtering is being used thanks to the
+\fB\-f\fP option) and use simple filtering instead.
+.TP
+.BI \-sns " int
+Specify the amplitude of the spatial noise shaping. Spatial noise shaping
+(or \fBsns\fP for short) refers to a general collection of built\-in algorithms
+used to decide which area of the picture should use relatively less bits,
+and where else to better transfer these bits. The possible range goes from
+0 (algorithm is off) to 100 (the maximal effect). The default value is 80.
+.TP
+.BI \-segments " int
+Change the number of partitions to use during the segmentation of the
+sns algorithm. Segments should be in range 1 to 4. Default value is 4.
+This option has no effect for methods 3 and up, unless \fB\-low_memory\fP
+is used.
+.TP
+.BI \-partition_limit " int
+Degrade quality by limiting the number of bits used by some macroblocks.
+Range is 0 (no degradation, the default) to 100 (full degradation).
+Useful values are usually around 30\-70 for moderately large images.
+In the VP8 format, the so\-called control partition has a limit of 512k and
+is used to store the following information: whether the macroblock is skipped,
+which segment it belongs to, whether it is coded as intra 4x4 or intra 16x16
+mode, and finally the prediction modes to use for each of the sub\-blocks.
+For a very large image, 512k only leaves room to few bits per 16x16 macroblock.
+The absolute minimum is 4 bits per macroblock. Skip, segment, and mode
+information can use up almost all these 4 bits (although the case is unlikely),
+which is problematic for very large images. The partition_limit factor controls
+how frequently the most bit\-costly mode (intra 4x4) will be used. This is
+useful in case the 512k limit is reached and the following message is displayed:
+\fIError code: 6 (PARTITION0_OVERFLOW: Partition #0 is too big to fit 512k)\fP.
+If using \fB\-partition_limit\fP is not enough to meet the 512k constraint, one
+should use less segments in order to save more header bits per macroblock.
+See the \fB\-segments\fP option.
+
+.SS LOGGING OPTIONS
+These options control the level of output:
 .TP
 .B \-v
 Print extra information (encoding time in particular).
 .TP
 .B \-print_psnr
-Compute and report average PSNR (Peak-Signal-To-Noise ratio).
+Compute and report average PSNR (Peak\-Signal\-To\-Noise ratio).
 .TP
 .B \-print_ssim
 Compute and report average SSIM (structural similarity
@ -256,13 +219,69 @@ Do not print anything.
 .TP
 .B \-short
 Only print brief information (output file size and PSNR) for testing purpose.
+.TP
+.BI \-map " int
+Output additional ASCII\-map of encoding information. Possible map values
+range from 1 to 6. This is only meant to help debugging.
+
+.SS ADDITIONAL OPTIONS
+More advanced options are:
+.TP
+.BI \-s " width height
+Specify that the input file actually consists of raw Y'CbCr samples following
+the ITU\-R BT.601 recommendation, in 4:2:0 linear format.
+The luma plane has size \fBwidth\fP x \fBheight\fP.
+.TP
+.BI \-pre " int
+Specify some preprocessing steps. Using a value of '2' will trigger
+quality\-dependent pseudo\-random dithering during RGBA\->YUVA conversion
+(lossy compression only).
+.TP
+.BI \-alpha_filter " string
+Specify the predictive filtering method for the alpha plane. One of 'none',
+\&'fast' or 'best', in increasing complexity and slowness order. Default is
+\&'fast'. Internally, alpha filtering is performed using four possible
+predictions (none, horizontal, vertical, gradient). The 'best' mode will try
+each mode in turn and pick the one which gives the smaller size. The 'fast'
+mode will just try to form an a priori guess without testing all modes.
+.TP
+.BI \-alpha_method " int
+Specify the algorithm used for alpha compression: 0 or 1. Algorithm 0 denotes
+no compression, 1 uses WebP lossless format for compression. The default is 1.
+.TP
+.B \-exact
+Preserve RGB values in transparent area. The default is off, to help
+compressibility.
+.TP
+.BI \-blend_alpha " int
+This option blends the alpha channel (if present) with the source using the
+background color specified in hexadecimal as 0xrrggbb. The alpha channel is
+afterward reset to the opaque value 255.
+.TP
+.B \-noalpha
+Using this option will discard the alpha channel.
+.TP
+.BI \-hint " string
+Specify the hint about input image type. Possible values are:
+\fBphoto\fP, \fBpicture\fP or \fBgraph\fP.
+.TP
+.BI \-metadata " string
+A comma separated list of metadata to copy from the input to the output if
+present.
+Valid values: \fBall\fP, \fBnone\fP, \fBexif\fP, \fBicc\fP, \fBxmp\fP.
+The default is \fBnone\fP.
+
+Note: each input format may not support all combinations.
+.TP
+.B \-noasm
+Disable all assembly optimizations.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
-http://www.webmproject.org/code/contribute/submitting-patches/
+http://www.webmproject.org/code/contribute/submitting\-patches/

 .SH EXAMPLES
 cwebp \-q 50 -lossless picture.png \-o picture_lossless.webp
@ -274,9 +293,10 @@ cwebp \-sns 70 \-f 50 \-size 60000 picture.png \-o picture.webp
 cwebp \-o picture.webp \-\- \-\-\-picture.png

 .SH AUTHORS
-\fBcwebp\fP was written by the WebP team.
+\fBcwebp\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "July 22, 2014"
+.TH DWEBP 1 "December 11, 2015"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@ -67,12 +67,11 @@ but it will make the decoding faster.
 Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
 post-processing effect applied to chroma components in lossy compression.
 It helps by smoothing gradients and avoiding banding artifacts.
-.\" TODO(jzern): restore post-v0.4.1
-.\" .TP
-.\" .BI \-alpha_dither
-.\" If the compressed file contains a transparency plane that was quantized
-.\" during compression, this flag will allow dithering the reconstructed plane
-.\" in order to generate smoother transparency gradients.
+.TP
+.BI \-alpha_dither
+If the compressed file contains a transparency plane that was quantized
+during compression, this flag will allow dithering the reconstructed plane
+in order to generate smoother transparency gradients.
 .TP
 .B \-nodither
 Disable all dithering (default).
@ -87,16 +86,20 @@ This cropping area must be fully contained within the source rectangle.
 The top-left corner will be snapped to even coordinates if needed.
 This option is meant to reduce the memory needed for cropping large images.
 Note: the cropping is applied \fIbefore\fP any scaling.
-.\" TODO(jzern): restore post-v0.4.1
-.\" .TP
-.\" .B \-flip
-.\" Flip decoded image vertically (can be useful for OpenGL textures for instance).
 .TP
-.BI \-scale " width height
+.B \-flip
+Flip decoded image vertically (can be useful for OpenGL textures for instance).
+.TP
+\fB\-resize\fR, \fB\-scale\fI width height\fR
 Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This
 option is mostly intended to reducing the memory needed to decode large images,
-when only a small version is needed (thumbnail, preview, etc.).  Note: scaling
+when only a small version is needed (thumbnail, preview, etc.). Note: scaling
 is applied \fIafter\fP cropping.
+If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
+the value will be calculated preserving the aspect-ratio.
+.TP
+.B \-quiet
+Do not print anything.
 .TP
 .B \-v
 Print extra information (decoding time in particular).
@ -106,7 +109,7 @@ Disable all assembly optimizations.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
@ -121,9 +124,10 @@ dwebp \-o output.ppm \-\- \-\-\-picture.webp
 cat picture.webp | dwebp \-o \- \-\- \- > output.ppm

 .SH AUTHORS
-\fBdwebp\fP was written by the WebP team.
+\fBdwebp\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "March 7, 2014"
+.TH GIF2WEBP 1 "December 11, 2015"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@ -54,6 +54,12 @@ additional encoding possibilities and decide on the quality gain.
 Lower value can result is faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
+.BI \-min_size
+Encode image to achieve smallest size. This disables key frame insertion and
+picks the dispose method resulting in smallest output for each frame. It uses
+lossless compression by default, but can be combined with \-q, \-m, \-lossy or
+\-mixed options.
+.TP
 .BI \-kmin " int
 .TP
 .BI \-kmax " int
@ -62,7 +68,8 @@ Specify the minimum and maximum distance between consecutive key frames
 some key frames into the output animation as needed so that this criteria is
 satisfied.
 .br
-A 'kmin' value of 0 will turn off insertion of key frames.
+A 'kmin' value of 0 will turn off insertion of key frames. A 'kmax' value of 0
+will result in all frames being key frames.
 Typical values are in the range 3 to 30. Default values are kmin = 9,
 kmax = 17 for lossless compression and kmin = 3, kmax = 5 for lossy compression.
 .br
@ -111,7 +118,7 @@ Do not print anything.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
@ -128,9 +135,10 @@ gif2webp \-lossy \-f 50 picture.gif \-o picture.webp
 gif2webp \-q 70 \-o picture.webp \-\- \-\-\-picture.gif

 .SH AUTHORS
-\fBgif2webp\fP was written by the WebP team.
+\fBgif2webp\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written by Urvang Joshi <urvang@google.com>, for the
 Debian project (and may be used by others).
--- a/man/vwebp.1
+++ b/man/vwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH VWEBP 1 "July 23, 2014"
+.TH VWEBP 1 "December 11, 2015"
 .SH NAME
 vwebp \- decompress a WebP file and display it in a window
 .SH SYNOPSIS
@ -33,11 +33,10 @@ Disable in-loop filtering.
 Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
 post-processing effect applied to chroma components in lossy compression.
 It helps by smoothing gradients and avoiding banding artifacts. Default: 50.
-.\" TODO(jzern): restore post-v0.4.1
-.\" .TP
-.\" .BI \-noalphadither
-.\" By default, quantized transparency planes are dithered during decompression,
-.\" to smooth the gradients. This flag will prevent this dithering.
+.TP
+.BI \-noalphadither
+By default, quantized transparency planes are dithered during decompression,
+to smooth the gradients. This flag will prevent this dithering.
 .TP
 .B \-mt
 Use multi-threading for decoding, if possible.
@ -65,7 +64,7 @@ Quit.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
@ -78,9 +77,10 @@ vwebp picture.webp -mt -dither 0
 vwebp \-\- \-\-\-picture.webp

 .SH AUTHORS
-\fBvwebp\fP was written by the WebP team.
+\fBvwebp\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written for the Debian project (and may be used by others).

--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPMUX 1 "August 28, 2014"
+.TH WEBPMUX 1 "December 11, 2015"
 .SH NAME
 webpmux \- create animated WebP files from non\-animated WebP images, extract
 frames from animated WebP images, and manage XMP/EXIF metadata and ICC profile.
@ -129,7 +129,7 @@ The nature of EXIF, XMP and ICC data is not checked and is assumed to be valid.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting\-patches/
@ -195,9 +195,10 @@ webpmux \-get icc \-o image_profile.icc \-\- \-\-\-icc_container.webp
 webpmux \-strip icc \-o without_icc.webp \-\- \-\-\-icc_container.webp

 .SH AUTHORS
-\fBwebpmux\fP is written by the WebP team.
+\fBwebpmux\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written by Vikas Arora <vikaas.arora@gmail.com>,
 for the Debian project (and may be used by others).
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -1,5 +1,5 @@
-# The mux and demux libraries depend on libwebp, thus the '.' to force the
-# build order so it's available to them.
+# The mux, demux and extras libraries depend on libwebp, thus the '.' to force
+# the build order so it's available to them.
 SUBDIRS = dec enc dsp utils .
 if WANT_MUX
  SUBDIRS += mux
@ -7,6 +7,9 @@ endif
 if WANT_DEMUX
  SUBDIRS += demux
 endif
+if WANT_EXTRAS
+  SUBDIRS += extras
+endif

 lib_LTLIBRARIES = libwebp.la

@ -35,7 +38,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 5:3:0
+libwebp_la_LDFLAGS = -no-undefined -version-info 6:0:0
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

@ -47,7 +50,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la

-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:3:0
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 2:0:0
  pkgconfig_DATA += libwebpdecoder.pc
 endif

--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@ -4,6 +4,7 @@ libwebpdecode_la_SOURCES =
 libwebpdecode_la_SOURCES += alpha.c
 libwebpdecode_la_SOURCES += alphai.h
 libwebpdecode_la_SOURCES += buffer.c
+libwebpdecode_la_SOURCES += common.h
 libwebpdecode_la_SOURCES += decode_vp8.h
 libwebpdecode_la_SOURCES += frame.c
 libwebpdecode_la_SOURCES += idec.c
@ -23,5 +24,5 @@ libwebpdecodeinclude_HEADERS += ../webp/types.h
 noinst_HEADERS =
 noinst_HEADERS += ../webp/format_constants.h

-libwebpdecode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
+libwebpdecode_la_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 libwebpdecodeincludedir = $(includedir)/webp
--- a/src/dec/alpha.c
+++ b/src/dec/alpha.c
@ -15,6 +15,7 @@
 #include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
+#include "../dsp/dsp.h"
 #include "../utils/quant_levels_dec.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
@ -78,6 +79,7 @@ static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
  }
+  VP8FiltersInit();
  return ok;
 }

--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@ -33,6 +33,11 @@ static int IsValidColorspace(int webp_csp_mode) {
  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
 }

+// strictly speaking, the very last (or first, if flipped) row
+// doesn't require padding.
+#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE)       \
+    (uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH)
+
 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  int ok = 1;
  const WEBP_CSP_MODE mode = buffer->colorspace;
@ -42,20 +47,22 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
    ok = 0;
  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int uv_width  = (width  + 1) / 2;
+    const int uv_height = (height + 1) / 2;
    const int y_stride = abs(buf->y_stride);
    const int u_stride = abs(buf->u_stride);
    const int v_stride = abs(buf->v_stride);
    const int a_stride = abs(buf->a_stride);
-    const uint64_t y_size = (uint64_t)y_stride * height;
-    const uint64_t u_size = (uint64_t)u_stride * ((height + 1) / 2);
-    const uint64_t v_size = (uint64_t)v_stride * ((height + 1) / 2);
-    const uint64_t a_size = (uint64_t)a_stride * height;
+    const uint64_t y_size = MIN_BUFFER_SIZE(width, height, y_stride);
+    const uint64_t u_size = MIN_BUFFER_SIZE(uv_width, uv_height, u_stride);
+    const uint64_t v_size = MIN_BUFFER_SIZE(uv_width, uv_height, v_stride);
+    const uint64_t a_size = MIN_BUFFER_SIZE(width, height, a_stride);
    ok &= (y_size <= buf->y_size);
    ok &= (u_size <= buf->u_size);
    ok &= (v_size <= buf->v_size);
    ok &= (y_stride >= width);
-    ok &= (u_stride >= (width + 1) / 2);
-    ok &= (v_stride >= (width + 1) / 2);
+    ok &= (u_stride >= uv_width);
+    ok &= (v_stride >= uv_width);
    ok &= (buf->y != NULL);
    ok &= (buf->u != NULL);
    ok &= (buf->v != NULL);
@ -67,13 +74,14 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  } else {    // RGB checks
    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
    const int stride = abs(buf->stride);
-    const uint64_t size = (uint64_t)stride * height;
+    const uint64_t size = MIN_BUFFER_SIZE(width, height, stride);
    ok &= (size <= buf->size);
    ok &= (stride >= width * kModeBpp[mode]);
    ok &= (buf->rgba != NULL);
  }
  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
+#undef MIN_BUFFER_SIZE

 static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
  const int w = buffer->width;
@ -181,11 +189,14 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
      h = ch;
    }
    if (options->use_scaling) {
-      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+      int scaled_width = options->scaled_width;
+      int scaled_height = options->scaled_height;
+      if (!WebPRescalerGetScaledDimensions(
+              w, h, &scaled_width, &scaled_height)) {
        return VP8_STATUS_INVALID_PARAM;
      }
-      w = options->scaled_width;
-      h = options->scaled_height;
+      w = scaled_width;
+      h = scaled_height;
    }
  }
  out->width = w;
@ -195,12 +206,10 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
  status = AllocateBuffer(out);
  if (status != VP8_STATUS_OK) return status;

-#if WEBP_DECODER_ABI_VERSION > 0x0203
  // Use the stride trick if vertical flip is needed.
  if (options != NULL && options->flip) {
    status = WebPFlipBuffer(out);
  }
-#endif
  return status;
 }

--- a/src/dec/common.h
+++ b/src/dec/common.h
@ -0,0 +1,54 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Definitions and macros common to encoding and decoding
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DEC_COMMON_H_
+#define WEBP_DEC_COMMON_H_
+
+// intra prediction modes
+enum { B_DC_PRED = 0,   // 4x4 modes
+       B_TM_PRED = 1,
+       B_VE_PRED = 2,
+       B_HE_PRED = 3,
+       B_RD_PRED = 4,
+       B_VR_PRED = 5,
+       B_LD_PRED = 6,
+       B_VL_PRED = 7,
+       B_HD_PRED = 8,
+       B_HU_PRED = 9,
+       NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
+
+       // Luma16 or UV modes
+       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
+       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
+       B_PRED = NUM_BMODES,   // refined I4x4 mode
+       NUM_PRED_MODES = 4,
+
+       // special modes
+       B_DC_PRED_NOTOP = 4,
+       B_DC_PRED_NOLEFT = 5,
+       B_DC_PRED_NOTOPLEFT = 6,
+       NUM_B_DC_MODES = 7 };
+
+enum { MB_FEATURE_TREE_PROBS = 3,
+       NUM_MB_SEGMENTS = 4,
+       NUM_REF_LF_DELTAS = 4,
+       NUM_MODE_LF_DELTAS = 4,    // I4x4, ZERO, *, SPLIT
+       MAX_NUM_PARTITIONS = 8,
+       // Probabilities
+       NUM_TYPES = 4,   // 0: i16-AC,  1: i16-DC,  2:chroma-AC,  3:i4-AC
+       NUM_BANDS = 8,
+       NUM_CTX = 3,
+       NUM_PROBAS = 11
+     };
+
+#endif    // WEBP_DEC_COMMON_H_
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@ -15,10 +15,180 @@
 #include "./vp8i.h"
 #include "../utils/utils.h"

-#define ALIGN_MASK (32 - 1)
+//------------------------------------------------------------------------------
+// Main reconstruction function.
+
+static const int kScan[16] = {
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
+};
+
+static int CheckMode(int mb_x, int mb_y, int mode) {
+  if (mode == B_DC_PRED) {
+    if (mb_x == 0) {
+      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+    } else {
+      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+    }
+  }
+  return mode;
+}
+
+static void Copy32b(uint8_t* const dst, const uint8_t* const src) {
+  memcpy(dst, src, 4);
+}
+
+static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
+                                    uint8_t* const dst) {
+  switch (bits >> 30) {
+    case 3:
+      VP8Transform(src, dst, 0);
+      break;
+    case 2:
+      VP8TransformAC3(src, dst);
+      break;
+    case 1:
+      VP8TransformDC(src, dst);
+      break;
+    default:
+      break;
+  }
+}
+
+static void DoUVTransform(uint32_t bits, const int16_t* const src,
+                          uint8_t* const dst) {
+  if (bits & 0xff) {    // any non-zero coeff at all?
+    if (bits & 0xaa) {  // any non-zero AC coefficient?
+      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
+    } else {
+      VP8TransformDCUV(src, dst);
+    }
+  }
+}

 static void ReconstructRow(const VP8Decoder* const dec,
-                           const VP8ThreadContext* ctx);  // TODO(skal): remove
+                           const VP8ThreadContext* ctx) {
+  int j;
+  int mb_x;
+  const int mb_y = ctx->mb_y_;
+  const int cache_id = ctx->id_;
+  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
+  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
+  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
+
+  // Initialize left-most block.
+  for (j = 0; j < 16; ++j) {
+    y_dst[j * BPS - 1] = 129;
+  }
+  for (j = 0; j < 8; ++j) {
+    u_dst[j * BPS - 1] = 129;
+    v_dst[j * BPS - 1] = 129;
+  }
+
+  // Init top-left sample on left column too.
+  if (mb_y > 0) {
+    y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
+  } else {
+    // we only need to do this init once at block (0,0).
+    // Afterward, it remains valid for the whole topmost row.
+    memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
+    memset(u_dst - BPS - 1, 127, 8 + 1);
+    memset(v_dst - BPS - 1, 127, 8 + 1);
+  }
+
+  // Reconstruct one row.
+  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+    const VP8MBData* const block = ctx->mb_data_ + mb_x;
+
+    // Rotate in the left samples from previously decoded block. We move four
+    // pixels at a time for alignment reason, and because of in-loop filter.
+    if (mb_x > 0) {
+      for (j = -1; j < 16; ++j) {
+        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
+      }
+      for (j = -1; j < 8; ++j) {
+        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
+        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
+      }
+    }
+    {
+      // bring top samples into the cache
+      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
+      const int16_t* const coeffs = block->coeffs_;
+      uint32_t bits = block->non_zero_y_;
+      int n;
+
+      if (mb_y > 0) {
+        memcpy(y_dst - BPS, top_yuv[0].y, 16);
+        memcpy(u_dst - BPS, top_yuv[0].u, 8);
+        memcpy(v_dst - BPS, top_yuv[0].v, 8);
+      }
+
+      // predict and add residuals
+      if (block->is_i4x4_) {   // 4x4
+        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+
+        if (mb_y > 0) {
+          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
+            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
+          } else {
+            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
+          }
+        }
+        // replicate the top-right pixels below
+        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+
+        // predict and add residuals for all 4x4 blocks in turn.
+        for (n = 0; n < 16; ++n, bits <<= 2) {
+          uint8_t* const dst = y_dst + kScan[n];
+          VP8PredLuma4[block->imodes_[n]](dst);
+          DoTransform(bits, coeffs + n * 16, dst);
+        }
+      } else {    // 16x16
+        const int pred_func = CheckMode(mb_x, mb_y, block->imodes_[0]);
+        VP8PredLuma16[pred_func](y_dst);
+        if (bits != 0) {
+          for (n = 0; n < 16; ++n, bits <<= 2) {
+            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
+          }
+        }
+      }
+      {
+        // Chroma
+        const uint32_t bits_uv = block->non_zero_uv_;
+        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
+        VP8PredChroma8[pred_func](u_dst);
+        VP8PredChroma8[pred_func](v_dst);
+        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
+        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
+      }
+
+      // stash away top samples for next block
+      if (mb_y < dec->mb_h_ - 1) {
+        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
+        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
+        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
+      }
+    }
+    // Transfer reconstructed samples from yuv_b_ cache to final destination.
+    {
+      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
+      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
+      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
+      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
+      for (j = 0; j < 16; ++j) {
+        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+      }
+      for (j = 0; j < 8; ++j) {
+        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
+        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+      }
+    }
+  }
+}

 //------------------------------------------------------------------------------
 // Filtering
@ -112,7 +282,6 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
        VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
        int level = base_level;
        if (hdr->use_lf_delta_) {
-          // TODO(skal): only CURRENT is handled for now.
          level += hdr->ref_lf_delta_[0];
          if (i4x4) {
            level += hdr->mode_lf_delta_[0];
@ -177,7 +346,6 @@ void VP8InitDithering(const WebPDecoderOptions* const options,
        dec->dither_ = 1;
      }
    }
-#if WEBP_DECODER_ABI_VERSION > 0x0204
    // potentially allow alpha dithering
    dec->alpha_dithering_ = options->alpha_dithering_strength;
    if (dec->alpha_dithering_ > 100) {
@ -185,7 +353,6 @@ void VP8InitDithering(const WebPDecoderOptions* const options,
    } else if (dec->alpha_dithering_ < 0) {
      dec->alpha_dithering_ = 0;
    }
-#endif
  }
 }

@ -554,7 +721,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
  const uint64_t needed = (uint64_t)intra_pred_mode_size
                        + top_size + mb_info_size + f_info_size
                        + yuv_size + mb_data_size
-                        + cache_size + alpha_size + ALIGN_MASK;
+                        + cache_size + alpha_size + WEBP_ALIGN_CST;
  uint8_t* mem;

  if (needed != (size_t)needed) return 0;  // check for overflow
@ -591,8 +758,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
    dec->thread_ctx_.f_info_ += mb_w;
  }

-  mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
-  assert((yuv_size & ALIGN_MASK) == 0);
+  mem = (uint8_t*)WEBP_ALIGN(mem);
+  assert((yuv_size & WEBP_ALIGN_CST) == 0);
  dec->yuv_b_ = (uint8_t*)mem;
  mem += yuv_size;

@ -644,7 +811,7 @@ static void InitIo(VP8Decoder* const dec, VP8Io* io) {
  io->a = NULL;
 }

-int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io) {
  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches_.
  if (!AllocateMemory(dec)) return 0;
  InitIo(dec, io);
@ -653,176 +820,3 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
 }

 //------------------------------------------------------------------------------
-// Main reconstruction function.
-
-static const int kScan[16] = {
-  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
-  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
-  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
-  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
-};
-
-static int CheckMode(int mb_x, int mb_y, int mode) {
-  if (mode == B_DC_PRED) {
-    if (mb_x == 0) {
-      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
-    } else {
-      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
-    }
-  }
-  return mode;
-}
-
-static void Copy32b(uint8_t* dst, uint8_t* src) {
-  memcpy(dst, src, 4);
-}
-
-static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
-                                    uint8_t* const dst) {
-  switch (bits >> 30) {
-    case 3:
-      VP8Transform(src, dst, 0);
-      break;
-    case 2:
-      VP8TransformAC3(src, dst);
-      break;
-    case 1:
-      VP8TransformDC(src, dst);
-      break;
-    default:
-      break;
-  }
-}
-
-static void DoUVTransform(uint32_t bits, const int16_t* const src,
-                          uint8_t* const dst) {
-  if (bits & 0xff) {    // any non-zero coeff at all?
-    if (bits & 0xaa) {  // any non-zero AC coefficient?
-      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
-    } else {
-      VP8TransformDCUV(src, dst);
-    }
-  }
-}
-
-static void ReconstructRow(const VP8Decoder* const dec,
-                           const VP8ThreadContext* ctx) {
-  int j;
-  int mb_x;
-  const int mb_y = ctx->mb_y_;
-  const int cache_id = ctx->id_;
-  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
-  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
-  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
-  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
-    const VP8MBData* const block = ctx->mb_data_ + mb_x;
-
-    // Rotate in the left samples from previously decoded block. We move four
-    // pixels at a time for alignment reason, and because of in-loop filter.
-    if (mb_x > 0) {
-      for (j = -1; j < 16; ++j) {
-        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
-      }
-      for (j = -1; j < 8; ++j) {
-        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
-        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
-      }
-    } else {
-      for (j = 0; j < 16; ++j) {
-        y_dst[j * BPS - 1] = 129;
-      }
-      for (j = 0; j < 8; ++j) {
-        u_dst[j * BPS - 1] = 129;
-        v_dst[j * BPS - 1] = 129;
-      }
-      // Init top-left sample on left column too
-      if (mb_y > 0) {
-        y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
-      }
-    }
-    {
-      // bring top samples into the cache
-      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
-      const int16_t* const coeffs = block->coeffs_;
-      uint32_t bits = block->non_zero_y_;
-      int n;
-
-      if (mb_y > 0) {
-        memcpy(y_dst - BPS, top_yuv[0].y, 16);
-        memcpy(u_dst - BPS, top_yuv[0].u, 8);
-        memcpy(v_dst - BPS, top_yuv[0].v, 8);
-      } else if (mb_x == 0) {
-        // we only need to do this init once at block (0,0).
-        // Afterward, it remains valid for the whole topmost row.
-        memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
-        memset(u_dst - BPS - 1, 127, 8 + 1);
-        memset(v_dst - BPS - 1, 127, 8 + 1);
-      }
-
-      // predict and add residuals
-      if (block->is_i4x4_) {   // 4x4
-        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
-
-        if (mb_y > 0) {
-          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
-            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
-          } else {
-            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
-          }
-        }
-        // replicate the top-right pixels below
-        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
-
-        // predict and add residuals for all 4x4 blocks in turn.
-        for (n = 0; n < 16; ++n, bits <<= 2) {
-          uint8_t* const dst = y_dst + kScan[n];
-          VP8PredLuma4[block->imodes_[n]](dst);
-          DoTransform(bits, coeffs + n * 16, dst);
-        }
-      } else {    // 16x16
-        const int pred_func = CheckMode(mb_x, mb_y,
-                                        block->imodes_[0]);
-        VP8PredLuma16[pred_func](y_dst);
-        if (bits != 0) {
-          for (n = 0; n < 16; ++n, bits <<= 2) {
-            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
-          }
-        }
-      }
-      {
-        // Chroma
-        const uint32_t bits_uv = block->non_zero_uv_;
-        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
-        VP8PredChroma8[pred_func](u_dst);
-        VP8PredChroma8[pred_func](v_dst);
-        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
-        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
-      }
-
-      // stash away top samples for next block
-      if (mb_y < dec->mb_h_ - 1) {
-        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
-        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
-        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
-      }
-    }
-    // Transfer reconstructed samples from yuv_b_ cache to final destination.
-    {
-      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
-      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
-      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
-      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
-      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
-      for (j = 0; j < 16; ++j) {
-        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
-      }
-      for (j = 0; j < 8; ++j) {
-        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
-        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
-      }
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@ -130,8 +130,12 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
          VP8RemapBitReader(&dec->br_, offset);
        }
      }
-      assert(last_part >= 0);
-      dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
+      {
+        const uint8_t* const last_start = dec->parts_[last_part].buf_;
+        assert(last_part >= 0);
+        VP8BitReaderSetBuffer(&dec->parts_[last_part], last_start,
+                              mem->buf_ + mem->end_ - last_start);
+      }
      if (NeedCompressedAlpha(idec)) {
        ALPHDecoder* const alph_dec = dec->alph_dec_;
        dec->alpha_data_ += offset;
@ -240,17 +244,15 @@ static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {

 // To be called last.
 static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
-#if WEBP_DECODER_ABI_VERSION > 0x0203
  const WebPDecoderOptions* const options = idec->params_.options;
  WebPDecBuffer* const output = idec->params_.output;

  idec->state_ = STATE_DONE;
  if (options != NULL && options->flip) {
    return WebPFlipBuffer(output);
+  } else {
+    return VP8_STATUS_OK;
  }
-#endif
-  idec->state_ = STATE_DONE;
-  return VP8_STATUS_OK;
 }

 //------------------------------------------------------------------------------
@ -377,8 +379,7 @@ static VP8StatusCode CopyParts0Data(WebPIDecoder* const idec) {
    }
    memcpy(part0_buf, br->buf_, part_size);
    mem->part0_buf_ = part0_buf;
-    br->buf_ = part0_buf;
-    br->buf_end_ = part0_buf + part_size;
+    VP8BitReaderSetBuffer(br, part0_buf, part_size);
  } else {
    // Else: just keep pointers to the partition #0's data in dec_->br_.
  }
@ -506,9 +507,15 @@ static VP8StatusCode DecodeVP8LHeader(WebPIDecoder* const idec) {

  // Wait until there's enough data for decoding header.
  if (curr_size < (idec->chunk_size_ >> 3)) {
-    return VP8_STATUS_SUSPENDED;
+    dec->status_ = VP8_STATUS_SUSPENDED;
+    return ErrorStatusLossless(idec, dec->status_);
  }
+
  if (!VP8LDecodeHeader(dec, io)) {
+    if (dec->status_ == VP8_STATUS_BITSTREAM_ERROR &&
+        curr_size < idec->chunk_size_) {
+      dec->status_ = VP8_STATUS_SUSPENDED;
+    }
    return ErrorStatusLossless(idec, dec->status_);
  }
  // Allocate/verify output buffer now.
@ -527,23 +534,15 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
  const size_t curr_size = MemDataSize(&idec->mem_);
  assert(idec->is_lossless_);

-  // At present Lossless decoder can't decode image incrementally. So wait till
-  // all the image data is aggregated before image can be decoded.
-  if (curr_size < idec->chunk_size_) {
-    return VP8_STATUS_SUSPENDED;
-  }
+  // Switch to incremental decoding if we don't have all the bytes available.
+  dec->incremental_ = (curr_size < idec->chunk_size_);

  if (!VP8LDecodeImage(dec)) {
-    // The decoding is called after all the data-bytes are aggregated. Change
-    // the error to VP8_BITSTREAM_ERROR in case lossless decoder fails to decode
-    // all the pixels (VP8_STATUS_SUSPENDED).
-    if (dec->status_ == VP8_STATUS_SUSPENDED) {
-      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
-    }
    return ErrorStatusLossless(idec, dec->status_);
  }
-
-  return FinishDecoding(idec);
+  assert(dec->status_ == VP8_STATUS_OK || dec->status_ == VP8_STATUS_SUSPENDED);
+  return (dec->status_ == VP8_STATUS_SUSPENDED) ? dec->status_
+                                                : FinishDecoding(idec);
 }

  // Main decoding loop
@ -793,7 +792,6 @@ const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
  if (left != NULL) *left = 0;
  if (top != NULL) *top = 0;
-  // TODO(skal): later include handling of rotations.
  if (src) {
    if (width != NULL) *width = src->width;
    if (height != NULL) *height = idec->params_.last_y;
@ -859,4 +857,3 @@ int WebPISetIOHooks(WebPIDecoder* const idec,

  return 1;
 }
-
--- a/src/dec/io.c
+++ b/src/dec/io.c
@ -55,32 +55,6 @@ static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
  return io->mb_h;
 }

-//------------------------------------------------------------------------------
-// YUV444 -> RGB conversion
-
-#if 0   // TODO(skal): this is for future rescaling.
-static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) {
-  WebPDecBuffer* output = p->output;
-  const WebPRGBABuffer* const buf = &output->u.RGBA;
-  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
-  const uint8_t* y_src = io->y;
-  const uint8_t* u_src = io->u;
-  const uint8_t* v_src = io->v;
-  const WebPYUV444Converter convert = WebPYUV444Converters[output->colorspace];
-  const int mb_w = io->mb_w;
-  const int last = io->mb_h;
-  int j;
-  for (j = 0; j < last; ++j) {
-    convert(y_src, u_src, v_src, dst, mb_w);
-    y_src += io->y_stride;
-    u_src += io->uv_stride;
-    v_src += io->uv_stride;
-    dst += buf->stride;
-  }
-  return io->mb_h;
-}
-#endif
-
 //------------------------------------------------------------------------------
 // Fancy upsampling

@ -145,14 +119,16 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {

 //------------------------------------------------------------------------------

-static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
+                        int expected_num_lines_out) {
  const uint8_t* alpha = io->a;
  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
  int j;
-
+  (void)expected_num_lines_out;
+  assert(expected_num_lines_out == mb_h);
  if (alpha != NULL) {
    for (j = 0; j < mb_h; ++j) {
      memcpy(dst, alpha, mb_w * sizeof(*dst));
@ -195,7 +171,8 @@ static int GetAlphaSourceRow(const VP8Io* const io,
  return start_y;
 }

-static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
+                        int expected_num_lines_out) {
  const uint8_t* alpha = io->a;
  if (alpha != NULL) {
    const int mb_w = io->mb_w;
@ -206,21 +183,13 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
    int num_rows;
    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
-    uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
-    uint32_t alpha_mask = 0xff;
-    int i, j;
-
-    for (j = 0; j < num_rows; ++j) {
-      for (i = 0; i < mb_w; ++i) {
-        const uint32_t alpha_value = alpha[i];
-        dst[4 * i] = alpha_value;
-        alpha_mask &= alpha_value;
-      }
-      alpha += io->width;
-      dst += buf->stride;
-    }
-    // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
-    if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) {
+    uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
+    const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
+                                            num_rows, dst, buf->stride);
+    (void)expected_num_lines_out;
+    assert(expected_num_lines_out == num_rows);
+    // has_alpha is true if there's non-trivial alpha to premultiply with.
+    if (has_alpha && WebPIsPremultipliedMode(colorspace)) {
      WebPApplyAlphaMultiply(base_rgba, alpha_first,
                             mb_w, num_rows, buf->stride);
    }
@ -228,7 +197,8 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
  return 0;
 }

-static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
+static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
+                             int expected_num_lines_out) {
  const uint8_t* alpha = io->a;
  if (alpha != NULL) {
    const int mb_w = io->mb_w;
@ -244,7 +214,6 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
 #endif
    uint32_t alpha_mask = 0x0f;
    int i, j;
-
    for (j = 0; j < num_rows; ++j) {
      for (i = 0; i < mb_w; ++i) {
        // Fill in the alpha value (converted to 4 bits).
@ -255,6 +224,8 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
      alpha += io->width;
      alpha_dst += buf->stride;
    }
+    (void)expected_num_lines_out;
+    assert(expected_num_lines_out == num_rows);
    if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) {
      WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
    }
@ -296,12 +267,15 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
  return num_lines_out;
 }

-static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
+                                int expected_num_lines_out) {
  if (io->a != NULL) {
    const WebPYUVABuffer* const buf = &p->output->u.YUVA;
    uint8_t* dst_y = buf->y + p->last_y * buf->y_stride;
    const uint8_t* src_a = buf->a + p->last_y * buf->a_stride;
    const int num_lines_out = Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
+    (void)expected_num_lines_out;
+    assert(expected_num_lines_out == num_lines_out);
    if (num_lines_out > 0) {   // unmultiply the Y
      WebPMultRows(dst_y, buf->y_stride, src_a, buf->a_stride,
                   p->scaler_a.dst_width, num_lines_out, 1);
@ -322,37 +296,31 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
  const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
  size_t tmp_size;
-  int32_t* work;
+  rescaler_t* work;

  tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
  if (has_alpha) {
    tmp_size += work_size * sizeof(*work);
  }
-  p->memory = WebPSafeCalloc(1ULL, tmp_size);
+  p->memory = WebPSafeMalloc(1ULL, tmp_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                   buf->y, out_width, out_height, buf->y_stride, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                   work);
  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                   work + work_size);
  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                   work + work_size + uv_work_size);
  p->emit = EmitRescaledYUV;

  if (has_alpha) {
    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                     buf->a, out_width, out_height, buf->a_stride, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                     work + work_size + 2 * uv_work_size);
    p->emit_alpha = EmitRescaledAlphaYUV;
    WebPInitAlphaProcessing();
@ -367,17 +335,17 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
  const WebPYUV444Converter convert =
      WebPYUV444Converters[p->output->colorspace];
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* dst = buf->rgba + y_pos * buf->stride;
  int num_lines_out = 0;
  // For RGB rescaling, because of the YUV420, current scan position
  // U/V can be +1/-1 line from the Y one.  Hence the double test.
  while (WebPRescalerHasPendingOutput(&p->scaler_y) &&
         WebPRescalerHasPendingOutput(&p->scaler_u)) {
-    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    assert(y_pos + num_lines_out < p->output->height);
    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
-    WebPRescalerExportRow(&p->scaler_y, 0);
-    WebPRescalerExportRow(&p->scaler_u, 0);
-    WebPRescalerExportRow(&p->scaler_v, 0);
+    WebPRescalerExportRow(&p->scaler_y);
+    WebPRescalerExportRow(&p->scaler_u);
+    WebPRescalerExportRow(&p->scaler_v);
    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
            dst, p->scaler_y.dst_width);
    dst += buf->stride;
@ -395,55 +363,54 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
    const int y_lines_in =
        WebPRescalerImport(&p->scaler_y, mb_h - j,
                           io->y + j * io->y_stride, io->y_stride);
-    const int u_lines_in =
-        WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
-                           io->u + uv_j * io->uv_stride, io->uv_stride);
-    const int v_lines_in =
-        WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
-                           io->v + uv_j * io->uv_stride, io->uv_stride);
-    (void)v_lines_in;   // remove a gcc warning
-    assert(u_lines_in == v_lines_in);
    j += y_lines_in;
-    uv_j += u_lines_in;
-    num_lines_out += ExportRGB(p, num_lines_out);
+    if (WebPRescaleNeededLines(&p->scaler_u, uv_mb_h - uv_j)) {
+      const int u_lines_in =
+          WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
+                             io->u + uv_j * io->uv_stride, io->uv_stride);
+      const int v_lines_in =
+          WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
+                             io->v + uv_j * io->uv_stride, io->uv_stride);
+      (void)v_lines_in;   // remove a gcc warning
+      assert(u_lines_in == v_lines_in);
+      uv_j += u_lines_in;
+    }
+    num_lines_out += ExportRGB(p, p->last_y + num_lines_out);
  }
  return num_lines_out;
 }

-static int ExportAlpha(WebPDecParams* const p, int y_pos) {
+static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
  const WEBP_CSP_MODE colorspace = p->output->colorspace;
  const int alpha_first =
      (colorspace == MODE_ARGB || colorspace == MODE_Argb);
  uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
  int num_lines_out = 0;
  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
-  uint32_t alpha_mask = 0xff;
+  uint32_t non_opaque = 0;
  const int width = p->scaler_a.dst_width;

-  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
-    int i;
-    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
-    for (i = 0; i < width; ++i) {
-      const uint32_t alpha_value = p->scaler_a.dst[i];
-      dst[4 * i] = alpha_value;
-      alpha_mask &= alpha_value;
-    }
+  while (WebPRescalerHasPendingOutput(&p->scaler_a) &&
+         num_lines_out < max_lines_out) {
+    assert(y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a);
+    non_opaque |= WebPDispatchAlpha(p->scaler_a.dst, 0, width, 1, dst, 0);
    dst += buf->stride;
    ++num_lines_out;
  }
-  if (is_premult_alpha && alpha_mask != 0xff) {
+  if (is_premult_alpha && non_opaque) {
    WebPApplyAlphaMultiply(base_rgba, alpha_first,
                           width, num_lines_out, buf->stride);
  }
  return num_lines_out;
 }

-static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
+static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
+                               int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
 #ifdef WEBP_SWAP_16BIT_CSP
  uint8_t* alpha_dst = base_rgba;
 #else
@ -455,10 +422,11 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
  uint32_t alpha_mask = 0x0f;

-  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
+  while (WebPRescalerHasPendingOutput(&p->scaler_a) &&
+         num_lines_out < max_lines_out) {
    int i;
-    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
+    assert(y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a);
    for (i = 0; i < width; ++i) {
      // Fill in the alpha value (converted to 4 bits).
      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
@ -474,15 +442,17 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
  return num_lines_out;
 }

-static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
+                                int expected_num_out_lines) {
  if (io->a != NULL) {
    WebPRescaler* const scaler = &p->scaler_a;
-    int j = 0;
-    int pos = 0;
-    while (j < io->mb_h) {
-      j += WebPRescalerImport(scaler, io->mb_h - j,
-                              io->a + j * io->width, io->width);
-      pos += p->emit_alpha_row(p, pos);
+    int lines_left = expected_num_out_lines;
+    const int y_end = p->last_y + lines_left;
+    while (lines_left > 0) {
+      const int row_offset = scaler->src_y - io->mb_y;
+      WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y,
+                         io->a + row_offset * io->width, io->width);
+      lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left);
    }
  }
  return 0;
@ -495,7 +465,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const int uv_in_width  = (io->mb_w + 1) >> 1;
  const int uv_in_height = (io->mb_h + 1) >> 1;
  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
-  int32_t* work;  // rescalers work area
+  rescaler_t* work;  // rescalers work area
  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
  size_t tmp_size1, tmp_size2, total_size;

@ -506,30 +476,27 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
    tmp_size2 += out_width;
  }
  total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
-  p->memory = WebPSafeCalloc(1ULL, total_size);
+  p->memory = WebPSafeMalloc(1ULL, total_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
  tmp = (uint8_t*)(work + tmp_size1);
  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                   tmp + 0 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                   work + 0 * work_size);
  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                   tmp + 1 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 1 * work_size);
  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                   tmp + 2 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 2 * work_size);
  p->emit = EmitRescaledRGB;
+  WebPInitYUV444Converters();

  if (has_alpha) {
    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                     tmp + 3 * out_width, out_width, out_height, 0, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                     work + 3 * work_size);
    p->emit_alpha = EmitRescaledAlphaRGB;
    if (p->output->colorspace == MODE_RGBA_4444 ||
@ -569,6 +536,7 @@ static int CustomSetup(VP8Io* io) {
    }
  } else {
    if (is_rgb) {
+      WebPInitSamplers();
      p->emit = EmitSampledRGB;   // default
      if (io->fancy_upsampling) {
 #ifdef FANCY_UPSAMPLING
@ -583,8 +551,6 @@ static int CustomSetup(VP8Io* io) {
        p->emit = EmitFancyRGB;
        WebPInitUpsamplers();
 #endif
-      } else {
-        WebPInitSamplers();
      }
    } else {
      p->emit = EmitYUV;
@ -621,7 +587,7 @@ static int CustomPut(const VP8Io* io) {
  }
  num_lines_out = p->emit(io, p);
  if (p->emit_alpha != NULL) {
-    p->emit_alpha(io, p);
+    p->emit_alpha(io, p, num_lines_out);
  }
  p->last_y += num_lines_out;
  return 1;
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@ -494,6 +494,12 @@ static const uint8_t
 };

 // Paragraph 9.9
+
+static const int kBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  // extra entry as sentinel
+};
+
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  VP8Proba* const proba = &dec->proba_;
  int t, b, c, p;
@ -507,6 +513,9 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
        }
      }
    }
+    for (b = 0; b < 16 + 1; ++b) {
+      proba->bands_ptr_[t][b] = &proba->bands_[t][kBands[b]];
+    }
  }
  dec->use_skip_proba_ = VP8Get(br);
  if (dec->use_skip_proba_) {
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -75,10 +75,7 @@ void VP8Delete(VP8Decoder* const dec) {

 int VP8SetError(VP8Decoder* const dec,
                VP8StatusCode error, const char* const msg) {
-  // TODO This check would be unnecessary if alpha decompression was separated
-  // from VP8ProcessRow/FinishRow. This avoids setting 'dec->status_' to
-  // something other than VP8_STATUS_BITSTREAM_ERROR on alpha decompression
-  // failure.
+  // The oldest error reported takes precedence over the new one.
  if (dec->status_ == VP8_STATUS_OK) {
    dec->status_ = error;
    dec->error_msg_ = msg;
@ -193,25 +190,27 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
  const uint8_t* sz = buf;
  const uint8_t* buf_end = buf + size;
  const uint8_t* part_start;
-  int last_part;
-  int p;
+  size_t size_left = size;
+  size_t last_part;
+  size_t p;

  dec->num_parts_ = 1 << VP8GetValue(br, 2);
  last_part = dec->num_parts_ - 1;
-  part_start = buf + last_part * 3;
-  if (buf_end < part_start) {
+  if (size < 3 * last_part) {
    // we can't even read the sizes with sz[]! That's a failure.
    return VP8_STATUS_NOT_ENOUGH_DATA;
  }
+  part_start = buf + last_part * 3;
+  size_left -= last_part * 3;
  for (p = 0; p < last_part; ++p) {
-    const uint32_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16);
-    const uint8_t* part_end = part_start + psize;
-    if (part_end > buf_end) part_end = buf_end;
-    VP8InitBitReader(dec->parts_ + p, part_start, part_end);
-    part_start = part_end;
+    size_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16);
+    if (psize > size_left) psize = size_left;
+    VP8InitBitReader(dec->parts_ + p, part_start, psize);
+    part_start += psize;
+    size_left -= psize;
    sz += 3;
  }
-  VP8InitBitReader(dec->parts_ + last_part, part_start, buf_end);
+  VP8InitBitReader(dec->parts_ + last_part, part_start, size_left);
  return (part_start < buf_end) ? VP8_STATUS_OK :
           VP8_STATUS_SUSPENDED;   // Init is ok, but there's not enough data
 }
@ -328,7 +327,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
  }

  br = &dec->br_;
-  VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
+  VP8InitBitReader(br, buf, frm_hdr->partition_length_);
  buf += frm_hdr->partition_length_;
  buf_size -= frm_hdr->partition_length_;

@ -371,11 +370,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
 //------------------------------------------------------------------------------
 // Residual decoding (Paragraph 13.2 / 13.3)

-static const int kBands[16 + 1] = {
-  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
-  0  // extra entry as sentinel
-};
-
 static const uint8_t kCat3[] = { 173, 148, 140, 0 };
 static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
 static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
@ -419,20 +413,19 @@ static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
 }

 // Returns the position of the last non-zero coeff plus one
-static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob,
+static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[],
                     int ctx, const quant_t dq, int n, int16_t* out) {
-  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
-  const uint8_t* p = prob[n].probas_[ctx];
+  const uint8_t* p = prob[n]->probas_[ctx];
  for (; n < 16; ++n) {
    if (!VP8GetBit(br, p[0])) {
      return n;  // previous coeff was last non-zero coeff
    }
    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
-      p = prob[kBands[++n]].probas_[0];
+      p = prob[++n]->probas_[0];
      if (n == 16) return 16;
    }
    {        // non zero coeff
-      const VP8ProbaArray* const p_ctx = &prob[kBands[n + 1]].probas_[0];
+      const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
      int v;
      if (!VP8GetBit(br, p[2])) {
        v = 1;
@ -455,8 +448,8 @@ static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {

 static int ParseResiduals(VP8Decoder* const dec,
                          VP8MB* const mb, VP8BitReader* const token_br) {
-  VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
-  const VP8BandProbas* ac_proba;
+  const VP8BandProbas* (* const bands)[16 + 1] = dec->proba_.bands_ptr_;
+  const VP8BandProbas* const * ac_proba;
  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
  const VP8QuantMatrix* const q = &dec->dqm_[block->segment_];
  int16_t* dst = block->coeffs_;
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -15,6 +15,7 @@
 #define WEBP_DEC_VP8I_H_

 #include <string.h>     // for memcpy()
+#include "./common.h"
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
 #include "../utils/random.h"
@ -30,46 +31,10 @@ extern "C" {

 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 4
-#define DEC_REV_VERSION 3
+#define DEC_MIN_VERSION 5
+#define DEC_REV_VERSION 0

-// intra prediction modes
-enum { B_DC_PRED = 0,   // 4x4 modes
-       B_TM_PRED,
-       B_VE_PRED,
-       B_HE_PRED,
-       B_RD_PRED,
-       B_VR_PRED,
-       B_LD_PRED,
-       B_VL_PRED,
-       B_HD_PRED,
-       B_HU_PRED,
-       NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
-
-       // Luma16 or UV modes
-       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
-       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
-       B_PRED = NUM_BMODES,   // refined I4x4 mode
-
-       // special modes
-       B_DC_PRED_NOTOP = 4,
-       B_DC_PRED_NOLEFT = 5,
-       B_DC_PRED_NOTOPLEFT = 6,
-       NUM_B_DC_MODES = 7 };
-
-enum { MB_FEATURE_TREE_PROBS = 3,
-       NUM_MB_SEGMENTS = 4,
-       NUM_REF_LF_DELTAS = 4,
-       NUM_MODE_LF_DELTAS = 4,    // I4x4, ZERO, *, SPLIT
-       MAX_NUM_PARTITIONS = 8,
-       // Probabilities
-       NUM_TYPES = 4,
-       NUM_BANDS = 8,
-       NUM_CTX = 3,
-       NUM_PROBAS = 11,
-       NUM_MV_PROBAS = 19 };
-
-// YUV-cache parameters.
+// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
 // and two 8x8 chroma blocks (u/v). These are better be 16-bytes aligned,
 // in order to be SIMD-friendly. We also need to store the top, left and
@ -91,8 +56,6 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 //  'y' = y-samples   'u' = u-samples     'v' = u-samples
 //  '|' = left sample,   '-' = top sample,    '+' = top-left sample
 //  't' = extra top-right sample for 4x4 modes
-// With this layout, BPS (=Bytes Per Scan-line) is one cacheline size.
-#define BPS       32    // this is the common stride used by yuv[]
 #define YUV_SIZE (BPS * 17 + BPS * 9)
 #define Y_SIZE   (BPS * 17)
 #define Y_OFF    (BPS * 1 + 8)
@ -130,7 +93,6 @@ typedef struct {
  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
 } VP8SegmentHeader;

-
 // probas associated to one of the contexts
 typedef uint8_t VP8ProbaArray[NUM_PROBAS];

@ -143,6 +105,7 @@ typedef struct {
  uint8_t segments_[MB_FEATURE_TREE_PROBS];
  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
+  const VP8BandProbas* bands_ptr_[NUM_TYPES][16 + 1];
 } VP8Proba;

 // Filter parameters
@ -317,7 +280,7 @@ int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);
 void VP8ParseQuant(VP8Decoder* const dec);

 // in frame.c
-int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
--- a/src/dec/vp8li.h
+++ b/src/dec/vp8li.h
@ -43,6 +43,7 @@ struct VP8LTransform {
 typedef struct {
  int             color_cache_size_;
  VP8LColorCache  color_cache_;
+  VP8LColorCache  saved_color_cache_;  // for incremental

  int             huffman_mask_;
  int             huffman_subsample_bits_;
@ -50,12 +51,12 @@ typedef struct {
  uint32_t       *huffman_image_;
  int             num_htree_groups_;
  HTreeGroup     *htree_groups_;
+  HuffmanCode    *huffman_tables_;
 } VP8LMetadata;

 typedef struct VP8LDecoder VP8LDecoder;
 struct VP8LDecoder {
  VP8StatusCode    status_;
-  VP8LDecodeState  action_;
  VP8LDecodeState  state_;
  VP8Io           *io_;

@ -66,6 +67,9 @@ struct VP8LDecoder {
  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.

  VP8LBitReader    br_;
+  int              incremental_;   // if true, incremental decoding is expected
+  VP8LBitReader    saved_br_;      // note: could be local variables too
+  int              saved_last_pixel_;

  int              width_;
  int              height_;
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@ -16,6 +16,7 @@
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
+#include "../utils/utils.h"
 #include "../webp/mux_types.h"  // ALPHA_FLAG

 //------------------------------------------------------------------------------
@ -43,14 +44,6 @@
 // All sizes are in little-endian order.
 // Note: chunk data size must be padded to multiple of 2 when written.

-static WEBP_INLINE uint32_t get_le24(const uint8_t* const data) {
-  return data[0] | (data[1] << 8) | (data[2] << 16);
-}
-
-static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) {
-  return (uint32_t)get_le24(data) | (data[3] << 24);
-}
-
 // Validates the RIFF container (if detected) and skips over it.
 // If a RIFF container is detected, returns:
 //     VP8_STATUS_BITSTREAM_ERROR for invalid header,
@ -70,7 +63,7 @@ static VP8StatusCode ParseRIFF(const uint8_t** const data,
    if (memcmp(*data + 8, "WEBP", TAG_SIZE)) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong image file signature.
    } else {
-      const uint32_t size = get_le32(*data + TAG_SIZE);
+      const uint32_t size = GetLE32(*data + TAG_SIZE);
      // Check that we have at least one chunk (i.e "WEBP" + "VP8?nnnn").
      if (size < TAG_SIZE + CHUNK_HEADER_SIZE) {
        return VP8_STATUS_BITSTREAM_ERROR;
@ -116,7 +109,7 @@ static VP8StatusCode ParseVP8X(const uint8_t** const data,
  if (!memcmp(*data, "VP8X", TAG_SIZE)) {
    int width, height;
    uint32_t flags;
-    const uint32_t chunk_size = get_le32(*data + TAG_SIZE);
+    const uint32_t chunk_size = GetLE32(*data + TAG_SIZE);
    if (chunk_size != VP8X_CHUNK_SIZE) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong chunk size.
    }
@ -125,9 +118,9 @@ static VP8StatusCode ParseVP8X(const uint8_t** const data,
    if (*data_size < vp8x_size) {
      return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
    }
-    flags = get_le32(*data + 8);
-    width = 1 + get_le24(*data + 12);
-    height = 1 + get_le24(*data + 15);
+    flags = GetLE32(*data + 8);
+    width = 1 + GetLE24(*data + 12);
+    height = 1 + GetLE24(*data + 15);
    if (width * (uint64_t)height >= MAX_IMAGE_AREA) {
      return VP8_STATUS_BITSTREAM_ERROR;  // image is too large
    }
@ -181,7 +174,7 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
      return VP8_STATUS_NOT_ENOUGH_DATA;
    }

-    chunk_size = get_le32(buf + TAG_SIZE);
+    chunk_size = GetLE32(buf + TAG_SIZE);
    if (chunk_size > MAX_CHUNK_PAYLOAD) {
      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
    }
@ -247,7 +240,7 @@ static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,

  if (is_vp8 || is_vp8l) {
    // Bitstream contains VP8/VP8L header.
-    const uint32_t size = get_le32(data + TAG_SIZE);
+    const uint32_t size = GetLE32(data + TAG_SIZE);
    if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Inconsistent size information.
    }
@ -521,11 +514,9 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
    WebPFreeDecBuffer(params->output);
  }

-#if WEBP_DECODER_ABI_VERSION > 0x0203
  if (params->options != NULL && params->options->flip) {
    status = WebPFlipBuffer(params->output);
  }
-#endif
  return status;
 }

@ -808,11 +799,13 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
  // Scaling
  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
  if (io->use_scaling) {
-    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+    int scaled_width = options->scaled_width;
+    int scaled_height = options->scaled_height;
+    if (!WebPRescalerGetScaledDimensions(w, h, &scaled_width, &scaled_height)) {
      return 0;
    }
-    io->scaled_width = options->scaled_width;
-    io->scaled_height = options->scaled_height;
+    io->scaled_width = scaled_width;
+    io->scaled_height = scaled_height;
  }

  // Filter
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@ -26,7 +26,10 @@ extern "C" {

 typedef struct WebPDecParams WebPDecParams;
 typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
-typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos);
+typedef int (*OutputAlphaFunc)(const VP8Io* const io, WebPDecParams* const p,
+                               int expected_num_out_lines);
+typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos,
+                             int max_out_lines);

 struct WebPDecParams {
  WebPDecBuffer* output;             // output buffer.
@ -40,7 +43,7 @@ struct WebPDecParams {
  void* memory;                  // overall scratch memory for the output work.

  OutputFunc emit;               // output RGB or YUV samples
-  OutputFunc emit_alpha;         // output alpha channel
+  OutputAlphaFunc emit_alpha;    // output alpha channel
  OutputRowFunc emit_alpha_row;  // output one line of rescaled alpha values
 };

--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -1,7 +1,7 @@
 lib_LTLIBRARIES = libwebpdemux.la

 libwebpdemux_la_SOURCES =
-libwebpdemux_la_SOURCES += demux.c
+libwebpdemux_la_SOURCES += anim_decode.c demux.c

 libwebpdemuxinclude_HEADERS =
 libwebpdemuxinclude_HEADERS += ../webp/demux.h
@ -9,6 +9,6 @@ libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
 libwebpdemuxinclude_HEADERS += ../webp/types.h

 libwebpdemux_la_LIBADD = ../libwebp.la
-libwebpdemux_la_LDFLAGS = -no-undefined -version-info 1:2:0
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:0:0
 libwebpdemuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/anim_decode.c
+++ b/src/demux/anim_decode.c
@ -0,0 +1,442 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  AnimDecoder implementation.
+//
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include <assert.h>
+#include <string.h>
+
+#include "../utils/utils.h"
+#include "../webp/decode.h"
+#include "../webp/demux.h"
+
+#define NUM_CHANNELS 4
+
+typedef void (*BlendRowFunc)(uint32_t* const, const uint32_t* const, int);
+static void BlendPixelRowNonPremult(uint32_t* const src,
+                                    const uint32_t* const dst, int num_pixels);
+static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
+                                 int num_pixels);
+
+struct WebPAnimDecoder {
+  WebPDemuxer* demux_;             // Demuxer created from given WebP bitstream.
+  WebPDecoderConfig config_;       // Decoder config.
+  // Note: we use a pointer to a function blending multiple pixels at a time to
+  // allow possible inlining of per-pixel blending function.
+  BlendRowFunc blend_func_;        // Pointer to the chose blend row function.
+  WebPAnimInfo info_;              // Global info about the animation.
+  uint8_t* curr_frame_;            // Current canvas (not disposed).
+  uint8_t* prev_frame_disposed_;   // Previous canvas (properly disposed).
+  int prev_frame_timestamp_;       // Previous frame timestamp (milliseconds).
+  WebPIterator prev_iter_;         // Iterator object for previous frame.
+  int prev_frame_was_keyframe_;    // True if previous frame was a keyframe.
+  int next_frame_;                 // Index of the next frame to be decoded
+                                   // (starting from 1).
+};
+
+static void DefaultDecoderOptions(WebPAnimDecoderOptions* const dec_options) {
+  dec_options->color_mode = MODE_RGBA;
+  dec_options->use_threads = 0;
+}
+
+int WebPAnimDecoderOptionsInitInternal(WebPAnimDecoderOptions* dec_options,
+                                       int abi_version) {
+  if (dec_options == NULL ||
+      WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) {
+    return 0;
+  }
+  DefaultDecoderOptions(dec_options);
+  return 1;
+}
+
+static int ApplyDecoderOptions(const WebPAnimDecoderOptions* const dec_options,
+                               WebPAnimDecoder* const dec) {
+  WEBP_CSP_MODE mode;
+  WebPDecoderConfig* config = &dec->config_;
+  assert(dec_options != NULL);
+
+  mode = dec_options->color_mode;
+  if (mode != MODE_RGBA && mode != MODE_BGRA &&
+      mode != MODE_rgbA && mode != MODE_bgrA) {
+    return 0;
+  }
+  dec->blend_func_ = (mode == MODE_RGBA || mode == MODE_BGRA)
+                         ? &BlendPixelRowNonPremult
+                         : &BlendPixelRowPremult;
+  WebPInitDecoderConfig(config);
+  config->output.colorspace = mode;
+  config->output.is_external_memory = 1;
+  config->options.use_threads = dec_options->use_threads;
+  // Note: config->output.u.RGBA is set at the time of decoding each frame.
+  return 1;
+}
+
+WebPAnimDecoder* WebPAnimDecoderNewInternal(
+    const WebPData* webp_data, const WebPAnimDecoderOptions* dec_options,
+    int abi_version) {
+  WebPAnimDecoderOptions options;
+  WebPAnimDecoder* dec = NULL;
+  if (webp_data == NULL ||
+      WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) {
+    return NULL;
+  }
+
+  // Note: calloc() so that the pointer members are initialized to NULL.
+  dec = (WebPAnimDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
+  if (dec == NULL) goto Error;
+
+  if (dec_options != NULL) {
+    options = *dec_options;
+  } else {
+    DefaultDecoderOptions(&options);
+  }
+  if (!ApplyDecoderOptions(&options, dec)) goto Error;
+
+  dec->demux_ = WebPDemux(webp_data);
+  if (dec->demux_ == NULL) goto Error;
+
+  dec->info_.canvas_width = WebPDemuxGetI(dec->demux_, WEBP_FF_CANVAS_WIDTH);
+  dec->info_.canvas_height = WebPDemuxGetI(dec->demux_, WEBP_FF_CANVAS_HEIGHT);
+  dec->info_.loop_count = WebPDemuxGetI(dec->demux_, WEBP_FF_LOOP_COUNT);
+  dec->info_.bgcolor = WebPDemuxGetI(dec->demux_, WEBP_FF_BACKGROUND_COLOR);
+  dec->info_.frame_count = WebPDemuxGetI(dec->demux_, WEBP_FF_FRAME_COUNT);
+
+  {
+    const int canvas_bytes =
+        dec->info_.canvas_width * NUM_CHANNELS * dec->info_.canvas_height;
+    // Note: calloc() because we fill frame with zeroes as well.
+    dec->curr_frame_ = WebPSafeCalloc(1ULL, canvas_bytes);
+    if (dec->curr_frame_ == NULL) goto Error;
+    dec->prev_frame_disposed_ = WebPSafeCalloc(1ULL, canvas_bytes);
+    if (dec->prev_frame_disposed_ == NULL) goto Error;
+  }
+
+  WebPAnimDecoderReset(dec);
+
+  return dec;
+
+ Error:
+  WebPAnimDecoderDelete(dec);
+  return NULL;
+}
+
+int WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec, WebPAnimInfo* info) {
+  if (dec == NULL || info == NULL) return 0;
+  *info = dec->info_;
+  return 1;
+}
+
+// Returns true if the frame covers the full canvas.
+static int IsFullFrame(int width, int height, int canvas_width,
+                       int canvas_height) {
+  return (width == canvas_width && height == canvas_height);
+}
+
+// Clear the canvas to transparent.
+static void ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
+                           uint32_t canvas_height) {
+  memset(buf, 0, canvas_width * NUM_CHANNELS * canvas_height);
+}
+
+// Clear given frame rectangle to transparent.
+static void ZeroFillFrameRect(uint8_t* buf, int buf_stride, int x_offset,
+                              int y_offset, int width, int height) {
+  int j;
+  assert(width * NUM_CHANNELS <= buf_stride);
+  buf += y_offset * buf_stride + x_offset * NUM_CHANNELS;
+  for (j = 0; j < height; ++j) {
+    memset(buf, 0, width * NUM_CHANNELS);
+    buf += buf_stride;
+  }
+}
+
+// Copy width * height pixels from 'src' to 'dst'.
+static void CopyCanvas(const uint8_t* src, uint8_t* dst,
+                       uint32_t width, uint32_t height) {
+  assert(src != NULL && dst != NULL);
+  memcpy(dst, src, width * NUM_CHANNELS * height);
+}
+
+// Returns true if the current frame is a key-frame.
+static int IsKeyFrame(const WebPIterator* const curr,
+                      const WebPIterator* const prev,
+                      int prev_frame_was_key_frame,
+                      int canvas_width, int canvas_height) {
+  if (curr->frame_num == 1) {
+    return 1;
+  } else if ((!curr->has_alpha || curr->blend_method == WEBP_MUX_NO_BLEND) &&
+             IsFullFrame(curr->width, curr->height,
+                         canvas_width, canvas_height)) {
+    return 1;
+  } else {
+    return (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) &&
+           (IsFullFrame(prev->width, prev->height, canvas_width,
+                        canvas_height) ||
+            prev_frame_was_key_frame);
+  }
+}
+
+
+// Blend a single channel of 'src' over 'dst', given their alpha channel values.
+// 'src' and 'dst' are assumed to be NOT pre-multiplied by alpha.
+static uint8_t BlendChannelNonPremult(uint32_t src, uint8_t src_a,
+                                      uint32_t dst, uint8_t dst_a,
+                                      uint32_t scale, int shift) {
+  const uint8_t src_channel = (src >> shift) & 0xff;
+  const uint8_t dst_channel = (dst >> shift) & 0xff;
+  const uint32_t blend_unscaled = src_channel * src_a + dst_channel * dst_a;
+  assert(blend_unscaled < (1ULL << 32) / scale);
+  return (blend_unscaled * scale) >> 24;
+}
+
+// Blend 'src' over 'dst' assuming they are NOT pre-multiplied by alpha.
+static uint32_t BlendPixelNonPremult(uint32_t src, uint32_t dst) {
+  const uint8_t src_a = (src >> 24) & 0xff;
+
+  if (src_a == 0) {
+    return dst;
+  } else {
+    const uint8_t dst_a = (dst >> 24) & 0xff;
+    // This is the approximate integer arithmetic for the actual formula:
+    // dst_factor_a = (dst_a * (255 - src_a)) / 255.
+    const uint8_t dst_factor_a = (dst_a * (256 - src_a)) >> 8;
+    const uint8_t blend_a = src_a + dst_factor_a;
+    const uint32_t scale = (1UL << 24) / blend_a;
+
+    const uint8_t blend_r =
+        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 0);
+    const uint8_t blend_g =
+        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 8);
+    const uint8_t blend_b =
+        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 16);
+    assert(src_a + dst_factor_a < 256);
+
+    return (blend_r << 0) |
+           (blend_g << 8) |
+           (blend_b << 16) |
+           ((uint32_t)blend_a << 24);
+  }
+}
+
+// Blend 'num_pixels' in 'src' over 'dst' assuming they are NOT pre-multiplied
+// by alpha.
+static void BlendPixelRowNonPremult(uint32_t* const src,
+                                    const uint32_t* const dst, int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    if (src_alpha != 0xff) {
+      src[i] = BlendPixelNonPremult(src[i], dst[i]);
+    }
+  }
+}
+
+// Individually multiply each channel in 'pix' by 'scale'.
+static WEBP_INLINE uint32_t ChannelwiseMultiply(uint32_t pix, uint32_t scale) {
+  uint32_t mask = 0x00FF00FF;
+  uint32_t rb = ((pix & mask) * scale) >> 8;
+  uint32_t ag = ((pix >> 8) & mask) * scale;
+  return (rb & mask) | (ag & ~mask);
+}
+
+// Blend 'src' over 'dst' assuming they are pre-multiplied by alpha.
+static uint32_t BlendPixelPremult(uint32_t src, uint32_t dst) {
+  const uint8_t src_a = (src >> 24) & 0xff;
+  return src + ChannelwiseMultiply(dst, 256 - src_a);
+}
+
+// Blend 'num_pixels' in 'src' over 'dst' assuming they are pre-multiplied by
+// alpha.
+static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
+                                 int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    if (src_alpha != 0xff) {
+      src[i] = BlendPixelPremult(src[i], dst[i]);
+    }
+  }
+}
+
+// Returns two ranges (<left, width> pairs) at row 'canvas_y', that belong to
+// 'src' but not 'dst'. A point range is empty if the corresponding width is 0.
+static void FindBlendRangeAtRow(const WebPIterator* const src,
+                                const WebPIterator* const dst, int canvas_y,
+                                int* const left1, int* const width1,
+                                int* const left2, int* const width2) {
+  const int src_max_x = src->x_offset + src->width;
+  const int dst_max_x = dst->x_offset + dst->width;
+  const int dst_max_y = dst->y_offset + dst->height;
+  assert(canvas_y >= src->y_offset && canvas_y < (src->y_offset + src->height));
+  *left1 = -1;
+  *width1 = 0;
+  *left2 = -1;
+  *width2 = 0;
+
+  if (canvas_y < dst->y_offset || canvas_y >= dst_max_y ||
+      src->x_offset >= dst_max_x || src_max_x <= dst->x_offset) {
+    *left1 = src->x_offset;
+    *width1 = src->width;
+    return;
+  }
+
+  if (src->x_offset < dst->x_offset) {
+    *left1 = src->x_offset;
+    *width1 = dst->x_offset - src->x_offset;
+  }
+
+  if (src_max_x > dst_max_x) {
+    *left2 = dst_max_x;
+    *width2 = src_max_x - dst_max_x;
+  }
+}
+
+int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
+                           uint8_t** buf_ptr, int* timestamp_ptr) {
+  WebPIterator iter;
+  uint32_t width;
+  uint32_t height;
+  int is_key_frame;
+  int timestamp;
+  BlendRowFunc blend_row;
+
+  if (dec == NULL || buf_ptr == NULL || timestamp_ptr == NULL) return 0;
+  if (!WebPAnimDecoderHasMoreFrames(dec)) return 0;
+
+  width = dec->info_.canvas_width;
+  height = dec->info_.canvas_height;
+  blend_row = dec->blend_func_;
+
+  // Get compressed frame.
+  if (!WebPDemuxGetFrame(dec->demux_, dec->next_frame_, &iter)) {
+    return 0;
+  }
+  timestamp = dec->prev_frame_timestamp_ + iter.duration;
+
+  // Initialize.
+  is_key_frame = IsKeyFrame(&iter, &dec->prev_iter_,
+                            dec->prev_frame_was_keyframe_, width, height);
+  if (is_key_frame) {
+    ZeroFillCanvas(dec->curr_frame_, width, height);
+  } else {
+    CopyCanvas(dec->prev_frame_disposed_, dec->curr_frame_, width, height);
+  }
+
+  // Decode.
+  {
+    const uint8_t* in = iter.fragment.bytes;
+    const size_t in_size = iter.fragment.size;
+    const size_t out_offset =
+        (iter.y_offset * width + iter.x_offset) * NUM_CHANNELS;
+    WebPDecoderConfig* const config = &dec->config_;
+    WebPRGBABuffer* const buf = &config->output.u.RGBA;
+    buf->stride = NUM_CHANNELS * width;
+    buf->size = buf->stride * iter.height;
+    buf->rgba = dec->curr_frame_ + out_offset;
+
+    if (WebPDecode(in, in_size, config) != VP8_STATUS_OK) {
+      goto Error;
+    }
+  }
+
+  // During the decoding of current frame, we may have set some pixels to be
+  // transparent (i.e. alpha < 255). However, the value of each of these
+  // pixels should have been determined by blending it against the value of
+  // that pixel in the previous frame if blending method of is WEBP_MUX_BLEND.
+  if (iter.frame_num > 1 && iter.blend_method == WEBP_MUX_BLEND &&
+      !is_key_frame) {
+    if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_NONE) {
+      int y;
+      // Blend transparent pixels with pixels in previous canvas.
+      for (y = 0; y < iter.height; ++y) {
+        const size_t offset =
+            (iter.y_offset + y) * width + iter.x_offset;
+        blend_row((uint32_t*)dec->curr_frame_ + offset,
+                  (uint32_t*)dec->prev_frame_disposed_ + offset, iter.width);
+      }
+    } else {
+      int y;
+      assert(dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND);
+      // We need to blend a transparent pixel with its value just after
+      // initialization. That is, blend it with:
+      // * Fully transparent pixel if it belongs to prevRect <-- No-op.
+      // * The pixel in the previous canvas otherwise <-- Need alpha-blending.
+      for (y = 0; y < iter.height; ++y) {
+        const int canvas_y = iter.y_offset + y;
+        int left1, width1, left2, width2;
+        FindBlendRangeAtRow(&iter, &dec->prev_iter_, canvas_y, &left1, &width1,
+                            &left2, &width2);
+        if (width1 > 0) {
+          const size_t offset1 = canvas_y * width + left1;
+          blend_row((uint32_t*)dec->curr_frame_ + offset1,
+                    (uint32_t*)dec->prev_frame_disposed_ + offset1, width1);
+        }
+        if (width2 > 0) {
+          const size_t offset2 = canvas_y * width + left2;
+          blend_row((uint32_t*)dec->curr_frame_ + offset2,
+                    (uint32_t*)dec->prev_frame_disposed_ + offset2, width2);
+        }
+      }
+    }
+  }
+
+  // Update info of the previous frame and dispose it for the next iteration.
+  dec->prev_frame_timestamp_ = timestamp;
+  dec->prev_iter_ = iter;
+  dec->prev_frame_was_keyframe_ = is_key_frame;
+  CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height);
+  if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+    ZeroFillFrameRect(dec->prev_frame_disposed_, width * NUM_CHANNELS,
+                      dec->prev_iter_.x_offset, dec->prev_iter_.y_offset,
+                      dec->prev_iter_.width, dec->prev_iter_.height);
+  }
+  ++dec->next_frame_;
+
+  // All OK, fill in the values.
+  *buf_ptr = dec->curr_frame_;
+  *timestamp_ptr = timestamp;
+  return 1;
+
+ Error:
+  WebPDemuxReleaseIterator(&iter);
+  return 0;
+}
+
+int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec) {
+  if (dec == NULL) return 0;
+  return (dec->next_frame_ <= (int)dec->info_.frame_count);
+}
+
+void WebPAnimDecoderReset(WebPAnimDecoder* dec) {
+  if (dec != NULL) {
+    dec->prev_frame_timestamp_ = 0;
+    memset(&dec->prev_iter_, 0, sizeof(dec->prev_iter_));
+    dec->prev_frame_was_keyframe_ = 0;
+    dec->next_frame_ = 1;
+  }
+}
+
+const WebPDemuxer* WebPAnimDecoderGetDemuxer(const WebPAnimDecoder* dec) {
+  if (dec == NULL) return NULL;
+  return dec->demux_;
+}
+
+void WebPAnimDecoderDelete(WebPAnimDecoder* dec) {
+  if (dec != NULL) {
+    WebPDemuxDelete(dec->demux_);
+    WebPSafeFree(dec->curr_frame_);
+    WebPSafeFree(dec->prev_frame_disposed_);
+    WebPSafeFree(dec);
+  }
+}
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@ -24,8 +24,8 @@
 #include "../webp/format_constants.h"

 #define DMUX_MAJ_VERSION 0
-#define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 2
+#define DMUX_MIN_VERSION 3
+#define DMUX_REV_VERSION 0

 typedef struct {
  size_t start_;        // start location of the data
@ -47,8 +47,7 @@ typedef struct Frame {
  int duration_;
  WebPMuxAnimDispose dispose_method_;
  WebPMuxAnimBlend blend_method_;
-  int is_fragment_;  // this is a frame fragment (and not a full frame).
-  int frame_num_;  // the referent frame number for use in assembling fragments.
+  int frame_num_;
  int complete_;   // img_components_ contains a full image.
  ChunkData img_components_[2];  // 0=VP8{,L} 1=ALPH
  struct Frame* next_;
@ -193,6 +192,19 @@ static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
  return 1;
 }

+static void SetFrameInfo(size_t start_offset, size_t size,
+                         int frame_num, int complete,
+                         const WebPBitstreamFeatures* const features,
+                         Frame* const frame) {
+  frame->img_components_[0].offset_ = start_offset;
+  frame->img_components_[0].size_ = size;
+  frame->width_ = features->width;
+  frame->height_ = features->height;
+  frame->has_alpha_ |= features->has_alpha;
+  frame->frame_num_ = frame_num;
+  frame->complete_ = complete;
+}
+
 // Store image bearing chunks to 'frame'.
 static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
                              MemBuffer* const mem, Frame* const frame) {
@ -248,13 +260,8 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
            return PARSE_ERROR;
          }
          ++image_chunks;
-          frame->img_components_[0].offset_ = chunk_start_offset;
-          frame->img_components_[0].size_ = chunk_size;
-          frame->width_ = features.width;
-          frame->height_ = features.height;
-          frame->has_alpha_ |= features.has_alpha;
-          frame->frame_num_ = frame_num;
-          frame->complete_ = (status == PARSE_OK);
+          SetFrameInfo(chunk_start_offset, chunk_size, frame_num,
+                       status == PARSE_OK, &features, frame);
          Skip(mem, payload_available);
        } else {
          goto Done;
@ -337,42 +344,6 @@ static ParseStatus ParseAnimationFrame(
  return status;
 }

-#ifdef WEBP_EXPERIMENTAL_FEATURES
-// Parse a 'FRGM' chunk and any image bearing chunks that immediately follow.
-// 'fragment_chunk_size' is the previously validated, padded chunk size.
-static ParseStatus ParseFragment(WebPDemuxer* const dmux,
-                                 uint32_t fragment_chunk_size) {
-  const int frame_num = 1;  // All fragments belong to the 1st (and only) frame.
-  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
-  const uint32_t frgm_payload_size = fragment_chunk_size - FRGM_CHUNK_SIZE;
-  int added_fragment = 0;
-  MemBuffer* const mem = &dmux->mem_;
-  Frame* frame;
-  ParseStatus status =
-      NewFrame(mem, FRGM_CHUNK_SIZE, fragment_chunk_size, &frame);
-  if (status != PARSE_OK) return status;
-
-  frame->is_fragment_ = 1;
-  frame->x_offset_ = 2 * ReadLE24s(mem);
-  frame->y_offset_ = 2 * ReadLE24s(mem);
-
-  // Store a fragment only if the 'fragments' flag is set and there is some
-  // data available.
-  status = StoreFrame(frame_num, frgm_payload_size, mem, frame);
-  if (status != PARSE_ERROR && is_fragmented && frame->frame_num_ > 0) {
-    added_fragment = AddFrame(dmux, frame);
-    if (!added_fragment) {
-      status = PARSE_ERROR;
-    } else {
-      dmux->num_frames_ = 1;
-    }
-  }
-
-  if (!added_fragment) WebPSafeFree(frame);
-  return status;
-}
-#endif  // WEBP_EXPERIMENTAL_FEATURES
-
 // General chunk storage, starting with the header at 'start_offset', allowing
 // the user to request the payload via a fourcc string. 'size' includes the
 // header and the unpadded payload size.
@ -513,12 +484,6 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
        status = ParseAnimationFrame(dmux, chunk_size_padded);
        break;
      }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      case MKFOURCC('F', 'R', 'G', 'M'): {
-        status = ParseFragment(dmux, chunk_size_padded);
-        break;
-      }
-#endif
      case MKFOURCC('I', 'C', 'C', 'P'): {
        store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
        goto Skip;
@ -606,8 +571,6 @@ static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {

 // If 'exact' is true, check that the image resolution matches the canvas.
 // If 'exact' is false, check that the x/y offsets do not exceed the canvas.
-// TODO(jzern): this is insufficient in the fragmented image case if the
-// expectation is that the fragments completely cover the canvas.
 static int CheckFrameBounds(const Frame* const frame, int exact,
                            int canvas_width, int canvas_height) {
  if (exact) {
@ -635,22 +598,17 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
  if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
  if (dmux->loop_count_ < 0) return 0;
  if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
-#ifndef WEBP_EXPERIMENTAL_FEATURES
  if (is_fragmented) return 0;
-#endif

  while (f != NULL) {
    const int cur_frame_set = f->frame_num_;
-    int frame_count = 0, fragment_count = 0;
+    int frame_count = 0;

-    // Check frame properties and if the image is composed of fragments that
-    // each fragment came from a fragment.
+    // Check frame properties.
    for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
      const ChunkData* const image = f->img_components_;
      const ChunkData* const alpha = f->img_components_ + 1;

-      if (is_fragmented && !f->is_fragment_) return 0;
-      if (!is_fragmented && f->is_fragment_) return 0;
      if (!is_animation && f->frame_num_ > 1) return 0;

      if (f->complete_) {
@ -675,16 +633,13 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
      }

      if (f->width_ > 0 && f->height_ > 0 &&
-          !CheckFrameBounds(f, !(is_animation || is_fragmented),
+          !CheckFrameBounds(f, !is_animation,
                            dmux->canvas_width_, dmux->canvas_height_)) {
        return 0;
      }

-      fragment_count += f->is_fragment_;
      ++frame_count;
    }
-    if (!is_fragmented && frame_count > 1) return 0;
-    if (fragment_count > 0 && frame_count != fragment_count) return 0;
  }
  return 1;
 }
@ -703,6 +658,41 @@ static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
  dmux->mem_ = *mem;
 }

+static ParseStatus CreateRawImageDemuxer(MemBuffer* const mem,
+                                         WebPDemuxer** demuxer) {
+  WebPBitstreamFeatures features;
+  const VP8StatusCode status =
+      WebPGetFeatures(mem->buf_, mem->buf_size_, &features);
+  *demuxer = NULL;
+  if (status != VP8_STATUS_OK) {
+    return (status == VP8_STATUS_NOT_ENOUGH_DATA) ? PARSE_NEED_MORE_DATA
+                                                  : PARSE_ERROR;
+  }
+
+  {
+    WebPDemuxer* const dmux = (WebPDemuxer*)WebPSafeCalloc(1ULL, sizeof(*dmux));
+    Frame* const frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
+    if (dmux == NULL || frame == NULL) goto Error;
+    InitDemux(dmux, mem);
+    SetFrameInfo(0, mem->buf_size_, 1 /*frame_num*/, 1 /*complete*/, &features,
+                 frame);
+    if (!AddFrame(dmux, frame)) goto Error;
+    dmux->state_ = WEBP_DEMUX_DONE;
+    dmux->canvas_width_ = frame->width_;
+    dmux->canvas_height_ = frame->height_;
+    dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
+    dmux->num_frames_ = 1;
+    assert(IsValidSimpleFormat(dmux));
+    *demuxer = dmux;
+    return PARSE_OK;
+
+ Error:
+    WebPSafeFree(dmux);
+    WebPSafeFree(frame);
+    return PARSE_ERROR;
+  }
+}
+
 WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
                               WebPDemuxState* state, int version) {
  const ChunkParser* parser;
@ -719,6 +709,15 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
  if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
  status = ReadHeader(&mem);
  if (status != PARSE_OK) {
+    // If parsing of the webp file header fails attempt to handle a raw
+    // VP8/VP8L frame. Note 'allow_partial' is ignored in this case.
+    if (status == PARSE_ERROR) {
+      status = CreateRawImageDemuxer(&mem, &dmux);
+      if (status == PARSE_OK) {
+        if (state != NULL) *state = WEBP_DEMUX_DONE;
+        return dmux;
+      }
+    }
    if (state != NULL) {
      *state = (status == PARSE_NEED_MORE_DATA) ? WEBP_DEMUX_PARSING_HEADER
                                                : WEBP_DEMUX_PARSE_ERROR;
@ -790,8 +789,6 @@ uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
 // -----------------------------------------------------------------------------
 // Frame iteration

-// Find the first 'frame_num' frame. There may be multiple such frames in a
-// fragmented frame.
 static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
  const Frame* f;
  for (f = dmux->frames_; f != NULL; f = f->next_) {
@ -800,21 +797,6 @@ static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
  return f;
 }

-// Returns fragment 'fragment_num' and the total count.
-static const Frame* GetFragment(
-    const Frame* const frame_set, int fragment_num, int* const count) {
-  const int this_frame = frame_set->frame_num_;
-  const Frame* f = frame_set;
-  const Frame* fragment = NULL;
-  int total;
-
-  for (total = 0; f != NULL && f->frame_num_ == this_frame; f = f->next_) {
-    if (++total == fragment_num) fragment = f;
-  }
-  *count = total;
-  return fragment;
-}
-
 static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
                                      const Frame* const frame,
                                      size_t* const data_size) {
@ -841,34 +823,27 @@ static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,

 // Create a whole 'frame' from VP8 (+ alpha) or lossless.
 static int SynthesizeFrame(const WebPDemuxer* const dmux,
-                           const Frame* const first_frame,
-                           int fragment_num, WebPIterator* const iter) {
+                           const Frame* const frame,
+                           WebPIterator* const iter) {
  const uint8_t* const mem_buf = dmux->mem_.buf_;
-  int num_fragments;
  size_t payload_size = 0;
-  const Frame* const fragment =
-      GetFragment(first_frame, fragment_num, &num_fragments);
-  const uint8_t* const payload =
-      GetFramePayload(mem_buf, fragment, &payload_size);
+  const uint8_t* const payload = GetFramePayload(mem_buf, frame, &payload_size);
  if (payload == NULL) return 0;
-  assert(first_frame != NULL);
+  assert(frame != NULL);

-  iter->frame_num      = first_frame->frame_num_;
+  iter->frame_num      = frame->frame_num_;
  iter->num_frames     = dmux->num_frames_;
-  iter->fragment_num   = fragment_num;
-  iter->num_fragments  = num_fragments;
-  iter->x_offset       = fragment->x_offset_;
-  iter->y_offset       = fragment->y_offset_;
-  iter->width          = fragment->width_;
-  iter->height         = fragment->height_;
-  iter->has_alpha      = fragment->has_alpha_;
-  iter->duration       = fragment->duration_;
-  iter->dispose_method = fragment->dispose_method_;
-  iter->blend_method   = fragment->blend_method_;
-  iter->complete       = fragment->complete_;
+  iter->x_offset       = frame->x_offset_;
+  iter->y_offset       = frame->y_offset_;
+  iter->width          = frame->width_;
+  iter->height         = frame->height_;
+  iter->has_alpha      = frame->has_alpha_;
+  iter->duration       = frame->duration_;
+  iter->dispose_method = frame->dispose_method_;
+  iter->blend_method   = frame->blend_method_;
+  iter->complete       = frame->complete_;
  iter->fragment.bytes = payload;
  iter->fragment.size  = payload_size;
-  // TODO(jzern): adjust offsets for 'FRGM's embedded in 'ANMF's
  return 1;
 }

@ -882,7 +857,7 @@ static int SetFrame(int frame_num, WebPIterator* const iter) {
  frame = GetFrame(dmux, frame_num);
  if (frame == NULL) return 0;

-  return SynthesizeFrame(dmux, frame, 1, iter);
+  return SynthesizeFrame(dmux, frame, iter);
 }

 int WebPDemuxGetFrame(const WebPDemuxer* dmux, int frame, WebPIterator* iter) {
@ -904,17 +879,6 @@ int WebPDemuxPrevFrame(WebPIterator* iter) {
  return SetFrame(iter->frame_num - 1, iter);
 }

-int WebPDemuxSelectFragment(WebPIterator* iter, int fragment_num) {
-  if (iter != NULL && iter->private_ != NULL && fragment_num > 0) {
-    const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
-    const Frame* const frame = GetFrame(dmux, iter->frame_num);
-    if (frame == NULL) return 0;
-
-    return SynthesizeFrame(dmux, frame, fragment_num, iter);
-  }
-  return 0;
-}
-
 void WebPDemuxReleaseIterator(WebPIterator* iter) {
  (void)iter;
 }
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -1,5 +1,6 @@
 noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
 noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
+noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la

 if BUILD_LIBWEBPDECODER
  noinst_LTLIBRARIES += libwebpdspdecode.la
@ -10,63 +11,107 @@ commondir = $(includedir)/webp

 COMMON_SOURCES =
 COMMON_SOURCES += alpha_processing.c
+COMMON_SOURCES += alpha_processing_mips_dsp_r2.c
 COMMON_SOURCES += cpu.c
 COMMON_SOURCES += dec.c
 COMMON_SOURCES += dec_clip_tables.c
 COMMON_SOURCES += dec_mips32.c
+COMMON_SOURCES += dec_mips_dsp_r2.c
 COMMON_SOURCES += dec_neon.c
 COMMON_SOURCES += dsp.h
+COMMON_SOURCES += filters.c
+COMMON_SOURCES += filters_mips_dsp_r2.c
 COMMON_SOURCES += lossless.c
 COMMON_SOURCES += lossless.h
-COMMON_SOURCES += lossless_mips32.c
+COMMON_SOURCES += lossless_mips_dsp_r2.c
 COMMON_SOURCES += lossless_neon.c
+COMMON_SOURCES += mips_macro.h
 COMMON_SOURCES += neon.h
+COMMON_SOURCES += rescaler.c
+COMMON_SOURCES += rescaler_mips32.c
+COMMON_SOURCES += rescaler_mips_dsp_r2.c
+COMMON_SOURCES += rescaler_neon.c
 COMMON_SOURCES += upsampling.c
+COMMON_SOURCES += upsampling_mips_dsp_r2.c
 COMMON_SOURCES += upsampling_neon.c
 COMMON_SOURCES += yuv.c
 COMMON_SOURCES += yuv.h
 COMMON_SOURCES += yuv_mips32.c
+COMMON_SOURCES += yuv_mips_dsp_r2.c

 ENC_SOURCES =
+ENC_SOURCES += argb.c
+ENC_SOURCES += argb_mips_dsp_r2.c
+ENC_SOURCES += cost.c
+ENC_SOURCES += cost_mips32.c
+ENC_SOURCES += cost_mips_dsp_r2.c
 ENC_SOURCES += enc.c
 ENC_SOURCES += enc_mips32.c
+ENC_SOURCES += enc_mips_dsp_r2.c
 ENC_SOURCES += enc_neon.c
+ENC_SOURCES += lossless_enc.c
+ENC_SOURCES += lossless_enc_mips32.c
+ENC_SOURCES += lossless_enc_mips_dsp_r2.c
+ENC_SOURCES += lossless_enc_neon.c

 libwebpdsp_avx2_la_SOURCES =
 libwebpdsp_avx2_la_SOURCES += enc_avx2.c
 libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)

+libwebpdspdecode_sse41_la_SOURCES =
+libwebpdspdecode_sse41_la_SOURCES += alpha_processing_sse41.c
+libwebpdspdecode_sse41_la_SOURCES += dec_sse41.c
+libwebpdspdecode_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdspdecode_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)
+
 libwebpdspdecode_sse2_la_SOURCES =
 libwebpdspdecode_sse2_la_SOURCES += alpha_processing_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += dec_sse2.c
+libwebpdspdecode_sse2_la_SOURCES += filters_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += lossless_sse2.c
+libwebpdspdecode_sse2_la_SOURCES += rescaler_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += upsampling_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += yuv_sse2.c
-libwebpdspdecode_sse2_la_SOURCES += yuv_tables_sse2.h
 libwebpdspdecode_sse2_la_CPPFLAGS = $(libwebpdsp_sse2_la_CPPFLAGS)
 libwebpdspdecode_sse2_la_CFLAGS = $(libwebpdsp_sse2_la_CFLAGS)

 libwebpdsp_sse2_la_SOURCES =
+libwebpdsp_sse2_la_SOURCES += argb_sse2.c
+libwebpdsp_sse2_la_SOURCES += cost_sse2.c
 libwebpdsp_sse2_la_SOURCES += enc_sse2.c
+libwebpdsp_sse2_la_SOURCES += lossless_enc_sse2.c
 libwebpdsp_sse2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
 libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la

+libwebpdsp_sse41_la_SOURCES =
+libwebpdsp_sse41_la_SOURCES += enc_sse41.c
+libwebpdsp_sse41_la_SOURCES += lossless_enc_sse41.c
+libwebpdsp_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdsp_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)
+libwebpdsp_sse41_la_LIBADD = libwebpdspdecode_sse41.la
+
 libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES)

 noinst_HEADERS =
 noinst_HEADERS += ../dec/decode_vp8.h
 noinst_HEADERS += ../webp/decode.h

-libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
+libwebpdsp_la_CPPFLAGS =
+libwebpdsp_la_CPPFLAGS += $(AM_CPPFLAGS)
+libwebpdsp_la_CPPFLAGS += $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
 libwebpdsp_la_LDFLAGS = -lm
-libwebpdsp_la_LIBADD = libwebpdsp_avx2.la libwebpdsp_sse2.la
+libwebpdsp_la_LIBADD =
+libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la
+libwebpdsp_la_LIBADD += libwebpdsp_sse41.la

 if BUILD_LIBWEBPDECODER
  libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)

  libwebpdspdecode_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
  libwebpdspdecode_la_LDFLAGS = $(libwebpdsp_la_LDFLAGS)
-  libwebpdspdecode_la_LIBADD = libwebpdspdecode_sse2.la
+  libwebpdspdecode_la_LIBADD =
+  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse2.la
+  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse41.la
 endif
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@ -134,7 +134,7 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {

 #endif    // USE_TABLES_FOR_ALPHA_MULT

-static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t argb = ptr[x];
@ -154,8 +154,8 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  }
 }

-static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
-                    int width, int inverse) {
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+                  int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t a = alpha[x];
@ -284,6 +284,38 @@ static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
 #endif
 }

+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+                         int width, int height,
+                         uint8_t* dst, int dst_stride) {
+  uint32_t alpha_mask = 0xff;
+  int i, j;
+
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = alpha[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+
+  return (alpha_mask != 0xff);
+}
+
+static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
+                                 int width, int height,
+                                 uint32_t* dst, int dst_stride) {
+  int i, j;
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      dst[i] = alpha[i] << 8;  // leave A/R/B channels zero'd.
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+}
+
 static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
@ -304,23 +336,29 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,

 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
+int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);

 //------------------------------------------------------------------------------
 // Init function

+extern void WebPInitAlphaProcessingMIPSdspR2(void);
 extern void WebPInitAlphaProcessingSSE2(void);
+extern void WebPInitAlphaProcessingSSE41(void);

 static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
    (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;

-void WebPInitAlphaProcessing(void) {
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;

-  WebPMultARGBRow = MultARGBRow;
-  WebPMultRow = MultRow;
+  WebPMultARGBRow = WebPMultARGBRowC;
+  WebPMultRow = WebPMultRowC;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+  WebPDispatchAlpha = DispatchAlpha;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
  WebPExtractAlpha = ExtractAlpha;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
@ -328,6 +366,16 @@ void WebPInitAlphaProcessing(void) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitAlphaProcessingSSE2();
+#if defined(WEBP_USE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        WebPInitAlphaProcessingSSE41();
+      }
+#endif
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      WebPInitAlphaProcessingMIPSdspR2();
    }
 #endif
  }
--- a/src/dsp/alpha_processing_mips_dsp_r2.c
+++ b/src/dsp/alpha_processing_mips_dsp_r2.c
@ -0,0 +1,141 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+//            Djordje Pesut  (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+                         int width, int height,
+                         uint8_t* dst, int dst_stride) {
+  uint32_t alpha_mask = 0xffffffff;
+  int i, j, temp0;
+
+  for (j = 0; j < height; ++j) {
+    uint8_t* pdst = dst;
+    const uint8_t* palpha = alpha;
+    for (i = 0; i < (width >> 2); ++i) {
+      int temp1, temp2, temp3;
+
+      __asm__ volatile (
+        "ulw    %[temp0],      0(%[palpha])                \n\t"
+        "addiu  %[palpha],     %[palpha],     4            \n\t"
+        "addiu  %[pdst],       %[pdst],       16           \n\t"
+        "srl    %[temp1],      %[temp0],      8            \n\t"
+        "srl    %[temp2],      %[temp0],      16           \n\t"
+        "srl    %[temp3],      %[temp0],      24           \n\t"
+        "and    %[alpha_mask], %[alpha_mask], %[temp0]     \n\t"
+        "sb     %[temp0],      -16(%[pdst])                \n\t"
+        "sb     %[temp1],      -12(%[pdst])                \n\t"
+        "sb     %[temp2],      -8(%[pdst])                 \n\t"
+        "sb     %[temp3],      -4(%[pdst])                 \n\t"
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+          [temp3]"=&r"(temp3), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+          [alpha_mask]"+r"(alpha_mask)
+        :
+        : "memory"
+      );
+    }
+
+    for (i = 0; i < (width & 3); ++i) {
+      __asm__ volatile (
+        "lbu    %[temp0],      0(%[palpha])                \n\t"
+        "addiu  %[palpha],     %[palpha],     1            \n\t"
+        "sb     %[temp0],      0(%[pdst])                  \n\t"
+        "and    %[alpha_mask], %[alpha_mask], %[temp0]     \n\t"
+        "addiu  %[pdst],       %[pdst],       4            \n\t"
+        : [temp0]"=&r"(temp0), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+          [alpha_mask]"+r"(alpha_mask)
+        :
+        : "memory"
+      );
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+
+  __asm__ volatile (
+    "ext    %[temp0],      %[alpha_mask], 0, 16            \n\t"
+    "srl    %[alpha_mask], %[alpha_mask], 16               \n\t"
+    "and    %[alpha_mask], %[alpha_mask], %[temp0]         \n\t"
+    "ext    %[temp0],      %[alpha_mask], 0, 8             \n\t"
+    "srl    %[alpha_mask], %[alpha_mask], 8                \n\t"
+    "and    %[alpha_mask], %[alpha_mask], %[temp0]         \n\t"
+    : [temp0]"=&r"(temp0), [alpha_mask]"+r"(alpha_mask)
+    :
+  );
+
+  return (alpha_mask != 0xff);
+}
+
+static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+  int x;
+  const uint32_t c_00ffffff = 0x00ffffffu;
+  const uint32_t c_ff000000 = 0xff000000u;
+  const uint32_t c_8000000  = 0x00800000u;
+  const uint32_t c_8000080  = 0x00800080u;
+  for (x = 0; x < width; ++x) {
+    const uint32_t argb = ptr[x];
+    if (argb < 0xff000000u) {      // alpha < 255
+      if (argb <= 0x00ffffffu) {   // alpha == 0
+        ptr[x] = 0;
+      } else {
+        int temp0, temp1, temp2, temp3, alpha;
+        __asm__ volatile (
+          "srl          %[alpha],   %[argb],       24                \n\t"
+          "replv.qb     %[temp0],   %[alpha]                         \n\t"
+          "and          %[temp0],   %[temp0],      %[c_00ffffff]     \n\t"
+          "beqz         %[inverse], 0f                               \n\t"
+          "divu         $zero,      %[c_ff000000], %[alpha]          \n\t"
+          "mflo         %[temp0]                                     \n\t"
+        "0:                                                          \n\t"
+          "andi         %[temp1],   %[argb],       0xff              \n\t"
+          "ext          %[temp2],   %[argb],       8,             8  \n\t"
+          "ext          %[temp3],   %[argb],       16,            8  \n\t"
+          "mul          %[temp1],   %[temp1],      %[temp0]          \n\t"
+          "mul          %[temp2],   %[temp2],      %[temp0]          \n\t"
+          "mul          %[temp3],   %[temp3],      %[temp0]          \n\t"
+          "precrq.ph.w  %[temp1],   %[temp2],      %[temp1]          \n\t"
+          "addu         %[temp3],   %[temp3],      %[c_8000000]      \n\t"
+          "addu         %[temp1],   %[temp1],      %[c_8000080]      \n\t"
+          "precrq.ph.w  %[temp3],   %[argb],       %[temp3]          \n\t"
+          "precrq.qb.ph %[temp1],   %[temp3],      %[temp1]          \n\t"
+          : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+            [temp3]"=&r"(temp3), [alpha]"=&r"(alpha)
+          : [inverse]"r"(inverse), [c_00ffffff]"r"(c_00ffffff),
+            [c_8000000]"r"(c_8000000), [c_8000080]"r"(c_8000080),
+            [c_ff000000]"r"(c_ff000000), [argb]"r"(argb)
+          : "memory", "hi", "lo"
+        );
+        ptr[x] = temp1;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitAlphaProcessingMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
+  WebPDispatchAlpha = DispatchAlpha;
+  WebPMultARGBRow = MultARGBRow;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@ -18,6 +18,86 @@

 //------------------------------------------------------------------------------

+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+                         int width, int height,
+                         uint8_t* dst, int dst_stride) {
+  // alpha_and stores an 'and' operation of all the alpha[] values. The final
+  // value is not 0xff if any of the alpha[] is not equal to 0xff.
+  uint32_t alpha_and = 0xff;
+  int i, j;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+  __m128i all_alphas = all_0xff;
+
+  // We must be able to access 3 extra bytes after the last written byte
+  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
+  // last byte of the quadruplet.
+  const int limit = (width - 1) & ~7;
+
+  for (j = 0; j < height; ++j) {
+    __m128i* out = (__m128i*)dst;
+    for (i = 0; i < limit; i += 8) {
+      // load 8 alpha bytes
+      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
+      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
+      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
+      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
+      // load 8 dst pixels (32 bytes)
+      const __m128i b0_lo = _mm_loadu_si128(out + 0);
+      const __m128i b0_hi = _mm_loadu_si128(out + 1);
+      // mask dst alpha values
+      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
+      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
+      // combine
+      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
+      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
+      // store
+      _mm_storeu_si128(out + 0, b2_lo);
+      _mm_storeu_si128(out + 1, b2_hi);
+      // accumulate eight alpha 'and' in parallel
+      all_alphas = _mm_and_si128(all_alphas, a0);
+      out += 2;
+    }
+    for (; i < width; ++i) {
+      const uint32_t alpha_value = alpha[i];
+      dst[4 * i] = alpha_value;
+      alpha_and &= alpha_value;
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+  // Combine the eight alpha 'and' into a 8-bit mask.
+  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+  return (alpha_and != 0xff);
+}
+
+static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
+                                 int width, int height,
+                                 uint32_t* dst, int dst_stride) {
+  int i, j;
+  const __m128i zero = _mm_setzero_si128();
+  const int limit = width & ~15;
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < limit; i += 16) {   // process 16 alpha bytes
+      const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
+      const __m128i a1 = _mm_unpacklo_epi8(zero, a0);  // note the 'zero' first!
+      const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
+      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
+      const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
+      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
+      const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
+      _mm_storeu_si128((__m128i*)&dst[i +  0], a2_lo);
+      _mm_storeu_si128((__m128i*)&dst[i +  4], a2_hi);
+      _mm_storeu_si128((__m128i*)&dst[i +  8], b2_lo);
+      _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
+    }
+    for (; i < width; ++i) dst[i] = alpha[i] << 8;
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+}
+
 static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
@ -63,15 +143,156 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
  return (alpha_and == 0xff);
 }

-#endif   // WEBP_USE_SSE2
+//------------------------------------------------------------------------------
+// Non-dither premultiplied modes
+
+#define MULTIPLIER(a)   ((a) * 0x8081)
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
+
+// We can't use a 'const int' for the SHUFFLE value, because it has to be an
+// immediate in the _mm_shufflexx_epi16() instruction. We really a macro here.
+#define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do {             \
+  const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX));     \
+  const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);         \
+  const __m128i alpha0 = _mm_and_si128(argb1, MASK);            \
+  const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE);  \
+  const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE);  \
+  /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */                       \
+  const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT);         \
+  const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT);         \
+  const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);         \
+  const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);         \
+  const __m128i argb4 = _mm_adds_epu16(argb2, argb3);           \
+  const __m128i argb5 = _mm_srli_epi16(argb4, 7);               \
+  const __m128i argb6 = _mm_or_si128(argb5, alpha0);            \
+  const __m128i argb7 = _mm_packus_epi16(argb6, zero);          \
+  _mm_storel_epi64((__m128i*)&(RGBX), argb7);                   \
+} while (0)
+
+static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
+                               int w, int h, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const int kSpan = 2;
+  const int w2 = w & ~(kSpan - 1);
+  while (h-- > 0) {
+    uint32_t* const rgbx = (uint32_t*)rgba;
+    int i;
+    if (!alpha_first) {
+      const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
+      const __m128i kMult =
+          _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);
+      for (i = 0; i < w2; i += kSpan) {
+        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
+      }
+    } else {
+      const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
+      const __m128i kMult =
+          _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);
+      for (i = 0; i < w2; i += kSpan) {
+        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
+      }
+    }
+    // Finish with left-overs.
+    for (; i < w; ++i) {
+      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+      const uint32_t a = alpha[4 * i];
+      if (a != 0xff) {
+        const uint32_t mult = MULTIPLIER(a);
+        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+      }
+    }
+    rgba += stride;
+  }
+}
+#undef MULTIPLIER
+#undef PREMULTIPLY
+
+// -----------------------------------------------------------------------------
+// Apply alpha value to rows
+
+// We use: kINV255 = (1 << 24) / 255 = 0x010101
+// So: a * kINV255 = (a << 16) | [(a << 8) | a]
+// -> _mm_mulhi_epu16() takes care of the (a<<16) part,
+// and _mm_mullo_epu16(a * 0x0101,...) takes care of the "(a << 8) | a" one.
+
+static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+  int x = 0;
+  if (!inverse) {
+    const int kSpan = 2;
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i kRound =
+        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
+    const __m128i kMult =
+        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
+    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
+    const int w2 = width & ~(kSpan - 1);
+    for (x = 0; x < w2; x += kSpan) {
+      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
+      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
+      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
+      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
+      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
+      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
+      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
+      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
+      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
+      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
+      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
+      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
+      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
+      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
+    }
+  }
+  width -= x;
+  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
+}
+
+static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
+                    int width, int inverse) {
+  int x = 0;
+  if (!inverse) {
+    const int kSpan = 8;
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i kRound = _mm_set1_epi16(1 << 7);
+    const int w2 = width & ~(kSpan - 1);
+    for (x = 0; x < w2; x += kSpan) {
+      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
+      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
+      const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
+      const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
+      const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
+      const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
+      const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
+      const __m128i v4 = _mm_adds_epu16(v2, v3);
+      const __m128i v5 = _mm_adds_epu16(v4, kRound);
+      const __m128i v6 = _mm_srli_epi16(v5, 8);
+      const __m128i v7 = _mm_packus_epi16(v6, zero);
+      _mm_storel_epi64((__m128i*)&ptr[x], v7);
+    }
+  }
+  width -= x;
+  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
+}

 //------------------------------------------------------------------------------
-// Init function
+// Entry point

 extern void WebPInitAlphaProcessingSSE2(void);

-void WebPInitAlphaProcessingSSE2(void) {
-#if defined(WEBP_USE_SSE2)
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
+  WebPMultARGBRow = MultARGBRow;
+  WebPMultRow = MultRow;
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+  WebPDispatchAlpha = DispatchAlpha;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
  WebPExtractAlpha = ExtractAlpha;
-#endif
 }
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/alpha_processing_sse41.c
+++ b/src/dsp/alpha_processing_sse41.c
@ -0,0 +1,92 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel, SSE4.1 variant.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include <smmintrin.h>
+
+//------------------------------------------------------------------------------
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  // alpha_and stores an 'and' operation of all the alpha[] values. The final
+  // value is not 0xff if any of the alpha[] is not equal to 0xff.
+  uint32_t alpha_and = 0xff;
+  int i, j;
+  const __m128i all_0xff = _mm_set1_epi32(~0u);
+  __m128i all_alphas = all_0xff;
+
+  // We must be able to access 3 extra bytes after the last written byte
+  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
+  // last byte of the quadruplet.
+  const int limit = (width - 1) & ~15;
+  const __m128i kCstAlpha0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+                                          -1, -1, -1, -1, 12, 8, 4, 0);
+  const __m128i kCstAlpha1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+                                          12, 8, 4, 0, -1, -1, -1, -1);
+  const __m128i kCstAlpha2 = _mm_set_epi8(-1, -1, -1, -1, 12, 8, 4, 0,
+                                          -1, -1, -1, -1, -1, -1, -1, -1);
+  const __m128i kCstAlpha3 = _mm_set_epi8(12, 8, 4, 0, -1, -1, -1, -1,
+                                          -1, -1, -1, -1, -1, -1, -1, -1);
+  for (j = 0; j < height; ++j) {
+    const __m128i* src = (const __m128i*)argb;
+    for (i = 0; i < limit; i += 16) {
+      // load 64 argb bytes
+      const __m128i a0 = _mm_loadu_si128(src + 0);
+      const __m128i a1 = _mm_loadu_si128(src + 1);
+      const __m128i a2 = _mm_loadu_si128(src + 2);
+      const __m128i a3 = _mm_loadu_si128(src + 3);
+      const __m128i b0 = _mm_shuffle_epi8(a0, kCstAlpha0);
+      const __m128i b1 = _mm_shuffle_epi8(a1, kCstAlpha1);
+      const __m128i b2 = _mm_shuffle_epi8(a2, kCstAlpha2);
+      const __m128i b3 = _mm_shuffle_epi8(a3, kCstAlpha3);
+      const __m128i c0 = _mm_or_si128(b0, b1);
+      const __m128i c1 = _mm_or_si128(b2, b3);
+      const __m128i d0 = _mm_or_si128(c0, c1);
+      // store
+      _mm_storeu_si128((__m128i*)&alpha[i], d0);
+      // accumulate sixteen alpha 'and' in parallel
+      all_alphas = _mm_and_si128(all_alphas, d0);
+      src += 4;
+    }
+    for (; i < width; ++i) {
+      const uint32_t alpha_value = argb[4 * i];
+      alpha[i] = alpha_value;
+      alpha_and &= alpha_value;
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  // Combine the sixteen alpha 'and' into an 8-bit mask.
+  alpha_and |= 0xff00u;  // pretend the upper bits [8..15] were tested ok.
+  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+  return (alpha_and == 0xffffu);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitAlphaProcessingSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
+  WebPExtractAlpha = ExtractAlpha;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE41)
+
+#endif  // WEBP_USE_SSE41
--- a/src/dsp/argb.c
+++ b/src/dsp/argb.c
@ -0,0 +1,68 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions.
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
+  }
+}
+
+static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out) {
+  int i, offset = 0;
+  for (i = 0; i < len; ++i) {
+    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
+    offset += step;
+  }
+}
+
+void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+                    const uint8_t*, int, uint32_t*);
+void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+                   int, int, uint32_t*);
+
+extern void VP8EncDspARGBInitMIPSdspR2(void);
+extern void VP8EncDspARGBInitSSE2(void);
+
+static volatile VP8CPUInfo argb_last_cpuinfo_used =
+    (VP8CPUInfo)&argb_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
+  if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  VP8PackARGB = PackARGB;
+  VP8PackRGB = PackRGB;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8EncDspARGBInitSSE2();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8EncDspARGBInitMIPSdspR2();
+    }
+#endif
+  }
+  argb_last_cpuinfo_used = VP8GetCPUInfo;
+}
--- a/src/dsp/argb_mips_dsp_r2.c
+++ b/src/dsp/argb_mips_dsp_r2.c
@ -0,0 +1,110 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions (mips version).
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  int temp0, temp1, temp2, temp3, offset;
+  const int rest = len & 1;
+  const uint32_t* const loop_end = out + len - rest;
+  const int step = 4;
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"
+    "beq          %[loop_end], %[out],    0f           \n\t"
+  "2:                                                  \n\t"
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
+    "addiu        %[out],      %[out],    4            \n\t"
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
+    "sw           %[temp0],    -4(%[out])              \n\t"
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"
+    "beq          %[rest],     $zero,     1f           \n\t"
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"
+  );
+}
+
+static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out) {
+  int temp0, temp1, temp2, offset;
+  const int rest = len & 1;
+  const int a = 0xff;
+  const uint32_t* const loop_end = out + len - rest;
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"
+    "beq          %[loop_end], %[out],    0f           \n\t"
+  "2:                                                  \n\t"
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "addiu        %[out],      %[out],    4            \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    -4(%[out])              \n\t"
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"
+    "beq          %[rest],     $zero,     1f           \n\t"
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"
+  );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspARGBInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
+  VP8PackARGB = PackARGB;
+  VP8PackRGB = PackRGB;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/argb_sse2.c
+++ b/src/dsp/argb_sse2.c
@ -0,0 +1,67 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions (SSE2 version).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <string.h>
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  if (g == r + 1) {  // RGBA input order. Need to swap R and B.
+    int i = 0;
+    const int len_max = len & ~3;  // max length processed in main loop
+    const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
+    assert(b == r + 2);
+    assert(a == r + 3);
+    for (; i < len_max; i += 4) {
+      const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
+      const __m128i B = _mm_and_si128(A, red_blue_mask);     // R 0 B 0
+      const __m128i C = _mm_andnot_si128(red_blue_mask, A);  // 0 G 0 A
+      const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
+      const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
+      const __m128i F = _mm_or_si128(E, C);
+      _mm_storeu_si128((__m128i*)(out + i), F);
+    }
+    for (; i < len; ++i) {
+      out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
+    }
+  } else {
+    assert(g == b + 1);
+    assert(r == b + 2);
+    assert(a == b + 3);
+    memcpy(out, b, len * 4);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspARGBInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
+  VP8PackARGB = PackARGB;
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/cost.c
+++ b/src/dsp/cost.c
@ -0,0 +1,412 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+#include "../enc/cost.h"
+
+//------------------------------------------------------------------------------
+// Boolean-cost cost table
+
+const uint16_t VP8EntropyCost[256] = {
+  1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
+  1178, 1152, 1110, 1076, 1061, 1024, 1024,  992,  968,  951,
+   939,  911,  896,  878,  871,  854,  838,  820,  811,  794,
+   786,  768,  768,  752,  740,  732,  720,  709,  704,  690,
+   683,  672,  666,  655,  647,  640,  631,  622,  615,  607,
+   598,  592,  586,  576,  572,  564,  559,  555,  547,  541,
+   534,  528,  522,  512,  512,  504,  500,  494,  488,  483,
+   477,  473,  467,  461,  458,  452,  448,  443,  438,  434,
+   427,  424,  419,  415,  410,  406,  403,  399,  394,  390,
+   384,  384,  377,  374,  370,  366,  362,  359,  355,  351,
+   347,  342,  342,  336,  333,  330,  326,  323,  320,  316,
+   312,  308,  305,  302,  299,  296,  293,  288,  287,  283,
+   280,  277,  274,  272,  268,  266,  262,  256,  256,  256,
+   251,  248,  245,  242,  240,  237,  234,  232,  228,  226,
+   223,  221,  218,  216,  214,  211,  208,  205,  203,  201,
+   198,  196,  192,  191,  188,  187,  183,  181,  179,  176,
+   175,  171,  171,  168,  165,  163,  160,  159,  156,  154,
+   152,  150,  148,  146,  144,  142,  139,  138,  135,  133,
+   131,  128,  128,  125,  123,  121,  119,  117,  115,  113,
+   111,  110,  107,  105,  103,  102,  100,   98,   96,   94,
+    92,   91,   89,   86,   86,   83,   82,   80,   77,   76,
+    74,   73,   71,   69,   67,   66,   64,   63,   61,   59,
+    57,   55,   54,   52,   51,   49,   47,   46,   44,   43,
+    41,   40,   38,   36,   35,   33,   32,   30,   29,   27,
+    25,   24,   22,   21,   19,   18,   16,   15,   13,   12,
+    10,    9,    7,    6,    4,    3
+};
+
+//------------------------------------------------------------------------------
+// Level cost tables
+
+// fixed costs for coding levels, deduce from the coding tree.
+// This is only the part that doesn't depend on the probability state.
+const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
+     0,  256,  256,  256,  256,  432,  618,  630,
+   731,  640,  640,  828,  901,  948, 1021, 1101,
+  1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
+  1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
+  1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
+  1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
+  1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
+  1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
+  1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
+  1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
+  1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
+  1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
+  1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
+  2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
+  2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
+  2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
+  2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
+  2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
+  2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
+  2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
+  2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
+  2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
+  2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
+  2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
+  2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
+  2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
+  2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
+  2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
+  2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
+  2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
+  2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
+  3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
+  3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
+  3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
+  3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
+  3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
+  3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
+  3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
+  3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
+  3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
+  3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
+  2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
+  2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
+  3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
+  3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
+  3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
+  3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
+  3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
+  3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
+  3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
+  3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
+  3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
+  3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
+  3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
+  3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
+  3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
+  3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
+  3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
+  3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
+  3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
+  3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
+  3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
+  4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
+  4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
+  4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
+  4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
+  4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
+  4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
+  4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
+  4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
+  4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
+  4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
+  4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
+  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+  6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
+  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+  6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
+  5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
+  5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
+  5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
+  5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
+  5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
+  5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
+  5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
+  5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
+  5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
+  5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
+  5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
+  6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
+  6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
+  6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
+  6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
+  6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
+  6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
+  6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
+  6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
+  6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
+  6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
+  6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
+  6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
+  6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
+  6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
+  6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
+  6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
+  6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
+  6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
+  6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
+  7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
+  7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
+  6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
+  6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
+  6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
+  6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
+  6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
+  6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
+  6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
+  6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
+  6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
+  6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
+  7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
+  7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
+  7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
+  7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
+  7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
+  7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
+  7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
+  7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
+  7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
+  7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
+  7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
+  7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
+  7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
+};
+
+//------------------------------------------------------------------------------
+// Tables for level coding
+
+const uint8_t VP8EncBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  // sentinel
+};
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  const int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+  for (; n < res->last; ++n) {
+    const int v = abs(res->coeffs[n]);
+    const int ctx = (v >= 2) ? 2 : v;
+    cost += VP8LevelCost(t, v);
+    t = costs[n + 1][ctx];
+  }
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+static void SetResidualCoeffs(const int16_t* const coeffs,
+                              VP8Residual* const res) {
+  int n;
+  res->last = -1;
+  assert(res->first == 0 || coeffs[0] == 0);
+  for (n = 15; n >= 0; --n) {
+    if (coeffs[n]) {
+      res->last = n;
+      break;
+    }
+  }
+  res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+VP8GetResidualCostFunc VP8GetResidualCost;
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+extern void VP8EncDspCostInitMIPS32(void);
+extern void VP8EncDspCostInitMIPSdspR2(void);
+extern void VP8EncDspCostInitSSE2(void);
+
+static volatile VP8CPUInfo cost_last_cpuinfo_used =
+    (VP8CPUInfo)&cost_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
+  if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  VP8GetResidualCost = GetResidualCost;
+  VP8SetResidualCoeffs = SetResidualCoeffs;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8EncDspCostInitMIPS32();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8EncDspCostInitMIPSdspR2();
+    }
+#endif
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8EncDspCostInitSSE2();
+    }
+#endif
+  }
+
+  cost_last_cpuinfo_used = VP8GetCPUInfo;
+}
+
+//------------------------------------------------------------------------------
--- a/src/dsp/cost_mips32.c
+++ b/src/dsp/cost_mips32.c
@ -0,0 +1,154 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../enc/cost.h"
+
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+  int temp0, temp1;
+  int v_reg, ctx_reg;
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+  const int16_t* res_coeffs = res->coeffs;
+  const int res_last = res->last;
+  const int const_max_level = MAX_VARIABLE_LEVEL;
+  const int const_2 = 2;
+  const uint16_t** p_costs = &costs[n][0];
+  const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+
+  __asm__ volatile (
+    ".set      push                                                        \n\t"
+    ".set      noreorder                                                   \n\t"
+    "subu      %[temp1],        %[res_last],        %[n]                   \n\t"
+    "sll       %[temp0],        %[n],               1                      \n\t"
+    "blez      %[temp1],        2f                                         \n\t"
+    " addu     %[res_coeffs],   %[res_coeffs],      %[temp0]               \n\t"
+  "1:                                                                      \n\t"
+    "lh        %[v_reg],        0(%[res_coeffs])                           \n\t"
+    "addiu     %[n],            %[n],               1                      \n\t"
+    "negu      %[temp0],        %[v_reg]                                   \n\t"
+    "slti      %[temp1],        %[v_reg],           0                      \n\t"
+    "movn      %[v_reg],        %[temp0],           %[temp1]               \n\t"
+    "sltiu     %[temp0],        %[v_reg],           2                      \n\t"
+    "move      %[ctx_reg],      %[v_reg]                                   \n\t"
+    "movz      %[ctx_reg],      %[const_2],         %[temp0]               \n\t"
+    "sll       %[temp1],        %[v_reg],           1                      \n\t"
+    "addu      %[temp1],        %[temp1],           %[VP8LevelFixedCosts]  \n\t"
+    "lhu       %[temp1],        0(%[temp1])                                \n\t"
+    "slt       %[temp0],        %[v_reg],           %[const_max_level]     \n\t"
+    "movz      %[v_reg],        %[const_max_level], %[temp0]               \n\t"
+    "addu      %[cost],         %[cost],            %[temp1]               \n\t"
+    "sll       %[v_reg],        %[v_reg],           1                      \n\t"
+    "sll       %[ctx_reg],      %[ctx_reg],         2                      \n\t"
+    "addu      %[v_reg],        %[v_reg],           %[t]                   \n\t"
+    "lhu       %[temp0],        0(%[v_reg])                                \n\t"
+    "addu      %[p_costs],      %[p_costs],         %[inc_p_costs]         \n\t"
+    "addu      %[t],            %[p_costs],         %[ctx_reg]             \n\t"
+    "addu      %[cost],         %[cost],            %[temp0]               \n\t"
+    "addiu     %[res_coeffs],   %[res_coeffs],      2                      \n\t"
+    "bne       %[n],            %[res_last],        1b                     \n\t"
+    " lw       %[t],            0(%[t])                                    \n\t"
+  "2:                                                                      \n\t"
+    ".set      pop                                                         \n\t"
+    : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+      [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
+    : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+      [inc_p_costs]"r"(inc_p_costs)
+    : "memory"
+  );
+
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+static void SetResidualCoeffs(const int16_t* const coeffs,
+                              VP8Residual* const res) {
+  const int16_t* p_coeffs = (int16_t*)coeffs;
+  int temp0, temp1, temp2, n, n1;
+  assert(res->first == 0 || coeffs[0] == 0);
+
+  __asm__ volatile (
+    ".set     push                                      \n\t"
+    ".set     noreorder                                 \n\t"
+    "addiu    %[p_coeffs],   %[p_coeffs],    28         \n\t"
+    "li       %[n],          15                         \n\t"
+    "li       %[temp2],      -1                         \n\t"
+  "0:                                                   \n\t"
+    "ulw      %[temp0],      0(%[p_coeffs])             \n\t"
+    "beqz     %[temp0],      1f                         \n\t"
+#if defined(WORDS_BIGENDIAN)
+    " sll     %[temp1],      %[temp0],       16         \n\t"
+#else
+    " srl     %[temp1],      %[temp0],       16         \n\t"
+#endif
+    "addiu    %[n1],         %[n],           -1         \n\t"
+    "movz     %[temp0],      %[n1],          %[temp1]   \n\t"
+    "movn     %[temp0],      %[n],           %[temp1]   \n\t"
+    "j        2f                                        \n\t"
+    " addiu   %[temp2],      %[temp0],       0          \n\t"
+  "1:                                                   \n\t"
+    "addiu    %[n],          %[n],           -2         \n\t"
+    "bgtz     %[n],          0b                         \n\t"
+    " addiu   %[p_coeffs],   %[p_coeffs],    -4         \n\t"
+  "2:                                                   \n\t"
+    ".set     pop                                       \n\t"
+    : [p_coeffs]"+&r"(p_coeffs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [n]"=&r"(n), [n1]"=&r"(n1)
+    :
+    : "memory"
+  );
+  res->last = temp2;
+  res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
+  VP8GetResidualCost = GetResidualCost;
+  VP8SetResidualCoeffs = SetResidualCoeffs;
+}
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
--- a/src/dsp/cost_mips_dsp_r2.c
+++ b/src/dsp/cost_mips_dsp_r2.c
@ -0,0 +1,107 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../enc/cost.h"
+
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+  int temp0, temp1;
+  int v_reg, ctx_reg;
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+  const int16_t* res_coeffs = res->coeffs;
+  const int res_last = res->last;
+  const int const_max_level = MAX_VARIABLE_LEVEL;
+  const int const_2 = 2;
+  const uint16_t** p_costs = &costs[n][0];
+  const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+
+  __asm__ volatile (
+    ".set      push                                                     \n\t"
+    ".set      noreorder                                                \n\t"
+    "subu      %[temp1],        %[res_last],        %[n]                \n\t"
+    "blez      %[temp1],        2f                                      \n\t"
+    " nop                                                               \n\t"
+  "1:                                                                   \n\t"
+    "sll       %[temp0],        %[n],               1                   \n\t"
+    "lhx       %[v_reg],        %[temp0](%[res_coeffs])                 \n\t"
+    "addiu     %[n],            %[n],               1                   \n\t"
+    "absq_s.w  %[v_reg],        %[v_reg]                                \n\t"
+    "sltiu     %[temp0],        %[v_reg],           2                   \n\t"
+    "move      %[ctx_reg],      %[v_reg]                                \n\t"
+    "movz      %[ctx_reg],      %[const_2],         %[temp0]            \n\t"
+    "sll       %[temp1],        %[v_reg],           1                   \n\t"
+    "lhx       %[temp1],        %[temp1](%[VP8LevelFixedCosts])         \n\t"
+    "slt       %[temp0],        %[v_reg],           %[const_max_level]  \n\t"
+    "movz      %[v_reg],        %[const_max_level], %[temp0]            \n\t"
+    "addu      %[cost],         %[cost],            %[temp1]            \n\t"
+    "sll       %[v_reg],        %[v_reg],           1                   \n\t"
+    "sll       %[ctx_reg],      %[ctx_reg],         2                   \n\t"
+    "lhx       %[temp0],        %[v_reg](%[t])                          \n\t"
+    "addu      %[p_costs],      %[p_costs],         %[inc_p_costs]      \n\t"
+    "addu      %[t],            %[p_costs],         %[ctx_reg]          \n\t"
+    "addu      %[cost],         %[cost],            %[temp0]            \n\t"
+    "bne       %[n],            %[res_last],        1b                  \n\t"
+    " lw       %[t],            0(%[t])                                 \n\t"
+  "2:                                                                   \n\t"
+    ".set      pop                                                      \n\t"
+    : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+      [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1)
+    : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+      [res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
+    : "memory"
+  );
+
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
+  VP8GetResidualCost = GetResidualCost;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/cost_sse2.c
+++ b/src/dsp/cost_sse2.c
@ -0,0 +1,119 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of cost functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+#include "../enc/cost.h"
+#include "../enc/vp8enci.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+
+static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                                  VP8Residual* const res) {
+  const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
+  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+  // Use SSE2 to compare 16 values with a single instruction.
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i m0 = _mm_packs_epi16(c0, c1);
+  const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
+  // Get the comparison results as a bitmask into 16bits. Negate the mask to get
+  // the position of entries that are not equal to zero. We don't need to mask
+  // out least significant bits according to res->first, since coeffs[0] is 0
+  // if res->first > 0.
+  const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
+  // The position of the most significant non-zero bit indicates the position of
+  // the last non-zero value.
+  assert(res->first == 0 || coeffs[0] == 0);
+  res->last = mask ? BitsLog2Floor(mask) : -1;
+  res->coeffs = coeffs;
+}
+
+static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
+  uint8_t levels[16], ctxs[16];
+  uint16_t abs_levels[16];
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  const int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+
+  {   // precompute clamped levels and contexts, packed to 8b.
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i kCst2 = _mm_set1_epi8(2);
+    const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
+    const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
+    const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
+    const __m128i D0 = _mm_sub_epi16(zero, c0);
+    const __m128i D1 = _mm_sub_epi16(zero, c1);
+    const __m128i E0 = _mm_max_epi16(c0, D0);   // abs(v), 16b
+    const __m128i E1 = _mm_max_epi16(c1, D1);
+    const __m128i F = _mm_packs_epi16(E0, E1);
+    const __m128i G = _mm_min_epu8(F, kCst2);    // context = 0,1,2
+    const __m128i H = _mm_min_epu8(F, kCst67);   // clamp_level in [0..67]
+
+    _mm_storeu_si128((__m128i*)&ctxs[0], G);
+    _mm_storeu_si128((__m128i*)&levels[0], H);
+
+    _mm_storeu_si128((__m128i*)&abs_levels[0], E0);
+    _mm_storeu_si128((__m128i*)&abs_levels[8], E1);
+  }
+  for (; n < res->last; ++n) {
+    const int ctx = ctxs[n];
+    const int level = levels[n];
+    const int flevel = abs_levels[n];   // full level
+    cost += VP8LevelFixedCosts[flevel] + t[level];  // simplified VP8LevelCost()
+    t = costs[n + 1][ctx];
+  }
+  // Last coefficient is always non-zero
+  {
+    const int level = levels[n];
+    const int flevel = abs_levels[n];
+    assert(flevel != 0);
+    cost += VP8LevelFixedCosts[flevel] + t[level];
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = ctxs[n];
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
+  VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
+  VP8GetResidualCost = GetResidualCostSSE2;
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -13,7 +13,7 @@

 #include "./dsp.h"

-#if defined(__ANDROID__)
+#if defined(WEBP_ANDROID_NEON)
 #include <cpu-features.h>
 #endif

@ -31,6 +31,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type), "c"(0));
 }
+#elif defined(__x86_64__) && \
+      (defined(__code_model_medium__) || defined(__code_model_large__)) && \
+      defined(__PIC__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "xchg{q}\t{%%rbx}, %q1\n"
+    "cpuid\n"
+    "xchg{q}\t{%%rbx}, %q1\n"
+    : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]),
+      "=d"(cpu_info[3])
+    : "a"(info_type), "c"(0));
+}
 #elif defined(__i386__) || defined(__x86_64__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
@ -79,7 +91,16 @@ static WEBP_INLINE uint64_t xgetbv(void) {

 #if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
 static int x86CPUInfo(CPUFeature feature) {
+  int max_cpuid_value;
  int cpu_info[4];
+
+  // get the highest feature value cpuid supports
+  GetCPUInfo(cpu_info, 0);
+  max_cpuid_value = cpu_info[0];
+  if (max_cpuid_value < 1) {
+    return 0;
+  }
+
  GetCPUInfo(cpu_info, 1);
  if (feature == kSSE2) {
    return 0 != (cpu_info[3] & 0x04000000);
@ -87,6 +108,9 @@ static int x86CPUInfo(CPUFeature feature) {
  if (feature == kSSE3) {
    return 0 != (cpu_info[2] & 0x00000001);
  }
+  if (feature == kSSE4_1) {
+    return 0 != (cpu_info[2] & 0x00080000);
+  }
  if (feature == kAVX) {
    // bits 27 (OSXSAVE) & 28 (256-bit AVX)
    if ((cpu_info[2] & 0x18000000) == 0x18000000) {
@ -95,7 +119,7 @@ static int x86CPUInfo(CPUFeature feature) {
    }
  }
  if (feature == kAVX2) {
-    if (x86CPUInfo(kAVX)) {
+    if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
      GetCPUInfo(cpu_info, 7);
      return ((cpu_info[1] & 0x00000020) == 0x00000020);
    }
@ -122,10 +146,14 @@ static int armCPUInfo(CPUFeature feature) {
  return 1;
 }
 VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
-#elif defined(WEBP_USE_MIPS32)
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2)
 static int mipsCPUInfo(CPUFeature feature) {
-  (void)feature;
-  return 1;
+  if ((feature == kMIPS32) || (feature == kMIPSdspR2)) {
+    return 1;
+  } else {
+    return 0;
+  }
+
 }
 VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
 #else
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -7,7 +7,7 @@
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// Speed-critical decoding functions.
+// Speed-critical decoding functions, default plain-C implementations.
 //
 // Author: Skal (pascal.massimino@gmail.com)

@ -34,9 +34,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
  STORE(3, y, DC - (d));            \
 } while (0)

-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
-#define MUL(a, b) (((a) * (b)) >> 16)
+#define MUL1(a) ((((a) * 20091) >> 16) + (a))
+#define MUL2(a) (((a) * 35468) >> 16)

 static void TransformOne(const int16_t* in, uint8_t* dst) {
  int C[4 * 4], *tmp;
@ -45,8 +44,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  for (i = 0; i < 4; ++i) {    // vertical pass
    const int a = in[0] + in[8];    // [-4096, 4094]
    const int b = in[0] - in[8];    // [-4095, 4095]
-    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);   // [-3783, 3783]
-    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);   // [-3785, 3781]
+    const int c = MUL2(in[4]) - MUL1(in[12]);   // [-3783, 3783]
+    const int d = MUL1(in[4]) + MUL2(in[12]);   // [-3785, 3781]
    tmp[0] = a + d;   // [-7881, 7875]
    tmp[1] = b + c;   // [-7878, 7878]
    tmp[2] = b - c;   // [-7878, 7878]
@ -55,7 +54,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    in++;
  }
  // Each pass is expanding the dynamic range by ~3.85 (upper bound).
-  // The exact value is (2. + (kC1 + kC2) / 65536).
+  // The exact value is (2. + (20091 + 35468) / 65536).
  // After the second pass, maximum interval is [-3794, 3794], assuming
  // an input in [-2048, 2047] interval. We then need to add a dst value
  // in the [0, 255] range.
@ -66,8 +65,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    const int dc = tmp[0] + 4;
    const int a =  dc +  tmp[8];
    const int b =  dc -  tmp[8];
-    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
-    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+    const int c = MUL2(tmp[4]) - MUL1(tmp[12]);
+    const int d = MUL1(tmp[4]) + MUL2(tmp[12]);
    STORE(0, 0, a + d);
    STORE(1, 0, b + c);
    STORE(2, 0, b - c);
@ -80,16 +79,17 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const int a = in[0] + 4;
-  const int c4 = MUL(in[4], kC2);
-  const int d4 = MUL(in[4], kC1);
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
+  const int c4 = MUL2(in[4]);
+  const int d4 = MUL1(in[4]);
+  const int c1 = MUL2(in[1]);
+  const int d1 = MUL1(in[1]);
  STORE2(0, a + d4, d1, c1);
  STORE2(1, a + c4, d1, c1);
  STORE2(2, a - c4, d1, c1);
  STORE2(3, a - d4, d1, c1);
 }
-#undef MUL
+#undef MUL1
+#undef MUL2
 #undef STORE2

 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
@ -104,7 +104,7 @@ static void TransformUV(const int16_t* in, uint8_t* dst) {
  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }

-static void TransformDC(const int16_t *in, uint8_t* dst) {
+static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int DC = in[0] + 4;
  int i, j;
  for (j = 0; j < 4; ++j) {
@ -160,7 +160,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);

 #define DST(x, y) dst[(x) + (y) * BPS]

-static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8_t* top = dst - BPS;
  const uint8_t* const clip0 = VP8kclip1 - top[-1];
  int y;
@ -173,21 +173,21 @@ static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
    dst += BPS;
  }
 }
-static void TM4(uint8_t *dst)   { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }
+static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }

 //------------------------------------------------------------------------------
 // 16x16

-static void VE16(uint8_t *dst) {     // vertical
+static void VE16(uint8_t* dst) {     // vertical
  int j;
  for (j = 0; j < 16; ++j) {
    memcpy(dst + j * BPS, dst - BPS, 16);
  }
 }

-static void HE16(uint8_t *dst) {     // horizontal
+static void HE16(uint8_t* dst) {     // horizontal
  int j;
  for (j = 16; j > 0; --j) {
    memset(dst, dst[-1], 16);
@ -202,7 +202,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
  }
 }

-static void DC16(uint8_t *dst) {    // DC
+static void DC16(uint8_t* dst) {    // DC
  int DC = 16;
  int j;
  for (j = 0; j < 16; ++j) {
@ -211,7 +211,7 @@ static void DC16(uint8_t *dst) {    // DC
  Put16(DC >> 5, dst);
 }

-static void DC16NoTop(uint8_t *dst) {   // DC with top samples not available
+static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
  int DC = 8;
  int j;
  for (j = 0; j < 16; ++j) {
@ -220,7 +220,7 @@ static void DC16NoTop(uint8_t *dst) {   // DC with top samples not available
  Put16(DC >> 4, dst);
 }

-static void DC16NoLeft(uint8_t *dst) {  // DC with left samples not available
+static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
  int DC = 8;
  int i;
  for (i = 0; i < 16; ++i) {
@ -229,17 +229,19 @@ static void DC16NoLeft(uint8_t *dst) {  // DC with left samples not available
  Put16(DC >> 4, dst);
 }

-static void DC16NoTopLeft(uint8_t *dst) {  // DC with no top and left samples
+static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
  Put16(0x80, dst);
 }

+VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
+
 //------------------------------------------------------------------------------
 // 4x4

 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

-static void VE4(uint8_t *dst) {    // vertical
+static void VE4(uint8_t* dst) {    // vertical
  const uint8_t* top = dst - BPS;
  const uint8_t vals[4] = {
    AVG3(top[-1], top[0], top[1]),
@ -253,19 +255,19 @@ static void VE4(uint8_t *dst) {    // vertical
  }
 }

-static void HE4(uint8_t *dst) {    // horizontal
+static void HE4(uint8_t* dst) {    // horizontal
  const int A = dst[-1 - BPS];
  const int B = dst[-1];
  const int C = dst[-1 + BPS];
  const int D = dst[-1 + 2 * BPS];
  const int E = dst[-1 + 3 * BPS];
-  *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(A, B, C);
-  *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(B, C, D);
-  *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(C, D, E);
-  *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E);
+  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(A, B, C));
+  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(B, C, D));
+  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(C, D, E));
+  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
 }

-static void DC4(uint8_t *dst) {   // DC
+static void DC4(uint8_t* dst) {   // DC
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@ -273,7 +275,7 @@ static void DC4(uint8_t *dst) {   // DC
  for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
 }

-static void RD4(uint8_t *dst) {   // Down-right
+static void RD4(uint8_t* dst) {   // Down-right
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -284,15 +286,15 @@ static void RD4(uint8_t *dst) {   // Down-right
  const int C = dst[2 - BPS];
  const int D = dst[3 - BPS];
  DST(0, 3)                                     = AVG3(J, K, L);
-  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
-  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
-  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
-  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
-  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
-  DST(3, 0)                                     = AVG3(D, C, B);
+  DST(1, 3) = DST(0, 2)                         = AVG3(I, J, K);
+  DST(2, 3) = DST(1, 2) = DST(0, 1)             = AVG3(X, I, J);
+  DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+              DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+                          DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+                                      DST(3, 0) = AVG3(D, C, B);
 }

-static void LD4(uint8_t *dst) {   // Down-Left
+static void LD4(uint8_t* dst) {   // Down-Left
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
@ -305,12 +307,12 @@ static void LD4(uint8_t *dst) {   // Down-Left
  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
-  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
-  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
-  DST(3, 3)                                     = AVG3(G, H, H);
+              DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+                          DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+                                      DST(3, 3) = AVG3(G, H, H);
 }

-static void VR4(uint8_t *dst) {   // Vertical-Right
+static void VR4(uint8_t* dst) {   // Vertical-Right
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -332,7 +334,7 @@ static void VR4(uint8_t *dst) {   // Vertical-Right
  DST(3, 1) =             AVG3(B, C, D);
 }

-static void VL4(uint8_t *dst) {   // Vertical-Left
+static void VL4(uint8_t* dst) {   // Vertical-Left
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
@ -354,7 +356,7 @@ static void VL4(uint8_t *dst) {   // Vertical-Left
              DST(3, 3) = AVG3(F, G, H);
 }

-static void HU4(uint8_t *dst) {   // Horizontal-Up
+static void HU4(uint8_t* dst) {   // Horizontal-Up
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -369,7 +371,7 @@ static void HU4(uint8_t *dst) {   // Horizontal-Up
    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static void HD4(uint8_t *dst) {  // Horizontal-Down
+static void HD4(uint8_t* dst) {  // Horizontal-Down
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -396,17 +398,19 @@ static void HD4(uint8_t *dst) {  // Horizontal-Down
 #undef AVG3
 #undef AVG2

+VP8PredFunc VP8PredLuma4[NUM_BMODES];
+
 //------------------------------------------------------------------------------
 // Chroma

-static void VE8uv(uint8_t *dst) {    // vertical
+static void VE8uv(uint8_t* dst) {    // vertical
  int j;
  for (j = 0; j < 8; ++j) {
    memcpy(dst + j * BPS, dst - BPS, 8);
  }
 }

-static void HE8uv(uint8_t *dst) {    // horizontal
+static void HE8uv(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    memset(dst, dst[-1], 8);
@ -422,7 +426,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  }
 }

-static void DC8uv(uint8_t *dst) {     // DC
+static void DC8uv(uint8_t* dst) {     // DC
  int dc0 = 8;
  int i;
  for (i = 0; i < 8; ++i) {
@ -431,7 +435,7 @@ static void DC8uv(uint8_t *dst) {     // DC
  Put8x8uv(dc0 >> 4, dst);
 }

-static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
@ -440,7 +444,7 @@ static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
  Put8x8uv(dc0 >> 3, dst);
 }

-static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
@ -449,26 +453,11 @@ static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
  Put8x8uv(dc0 >> 3, dst);
 }

-static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
+static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
  Put8x8uv(0x80, dst);
 }

-//------------------------------------------------------------------------------
-// default C implementations
-
-const VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
-  DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
-};
-
-const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
-  DC16, TM16, VE16, HE16,
-  DC16NoTop, DC16NoLeft, DC16NoTopLeft
-};
-
-const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
-  DC8uv, TM8uv, VE8uv, HE8uv,
-  DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
-};
+VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];

 //------------------------------------------------------------------------------
 // Edge filtering functions
@ -685,13 +674,15 @@ VP8SimpleFilterFunc VP8SimpleVFilter16i;
 VP8SimpleFilterFunc VP8SimpleHFilter16i;

 extern void VP8DspInitSSE2(void);
+extern void VP8DspInitSSE41(void);
 extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);
+extern void VP8DspInitMIPSdspR2(void);

 static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;

-void VP8DspInit(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
  if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;

  VP8InitClipTables();
@ -716,20 +707,59 @@ void VP8DspInit(void) {
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;

+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[3] = HE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[5] = VR4;
+  VP8PredLuma4[6] = LD4;
+  VP8PredLuma4[7] = VL4;
+  VP8PredLuma4[8] = HD4;
+  VP8PredLuma4[9] = HU4;
+
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
+
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8DspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8DspInitSSE41();
+      }
+#endif
    }
-#elif defined(WEBP_USE_NEON)
+#endif
+#if defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      VP8DspInitNEON();
    }
-#elif defined(WEBP_USE_MIPS32)
+#endif
+#if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8DspInitMIPS32();
    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8DspInitMIPSdspR2();
+    }
 #endif
  }
  dec_last_cpuinfo_used = VP8GetCPUInfo;
--- a/src/dsp/dec_clip_tables.c
+++ b/src/dsp/dec_clip_tables.c
@ -344,7 +344,7 @@ const int8_t* const VP8ksclip2 = &sclip2[112];
 const uint8_t* const VP8kclip1 = &clip1[255];
 const uint8_t* const VP8kabs0 = &abs0[255];

-void VP8InitClipTables(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
 #if !defined(USE_STATIC_TABLES)
  int i;
  if (!tables_ok) {
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@ -16,6 +16,8 @@

 #if defined(WEBP_USE_MIPS32)

+#include "./mips_macro.h"
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;

@ -52,6 +54,7 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+  // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
@ -68,9 +71,9 @@ static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
 }

-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  return ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) <= thresh);
+  return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
 }

 static WEBP_INLINE int needs_filter2(const uint8_t* p,
@ -78,7 +81,7 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
  const int p3 = p[-4 * step], p2 = p[-3 * step];
  const int p1 = p[-2 * step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
-  if ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) > t) {
+  if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
    return 0;
  }
  return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
@ -89,8 +92,9 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
 static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh, ithresh)) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
        do_filter2(p, hstride);
      } else {
@ -104,8 +108,9 @@ static WEBP_INLINE void FilterLoop26(uint8_t* p,
 static WEBP_INLINE void FilterLoop24(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh, ithresh)) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
        do_filter2(p, hstride);
      } else {
@ -176,8 +181,9 @@ static void HFilter16i(uint8_t* p, int stride,

 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
+  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i, stride, thresh)) {
+    if (needs_filter(p + i, stride, thresh2)) {
      do_filter2(p + i, stride);
    }
  }
@ -185,8 +191,9 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {

 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  int i;
+  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i * stride, 1, thresh)) {
+    if (needs_filter(p + i * stride, 1, thresh2)) {
      do_filter2(p + i * stride, 1);
    }
  }
@ -384,7 +391,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "sra      %[temp7],  %[temp7],  3                  \n\t"
    "sra      %[temp4],  %[temp4],  3                  \n\t"
    "addiu    %[temp6],  $zero,     255                \n\t"
-    "lbu      %[temp1],  0(%[dst])                     \n\t"
+    "lbu      %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
    "sra      %[temp5],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
@ -392,8 +399,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "1:                                                  \n\t"
-    "lbu      %[temp18], 1(%[dst])                     \n\t"
-    "sb       %[temp1],  0(%[dst])                     \n\t"
+    "lbu      %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp18], %[temp18], %[temp11]          \n\t"
    "sra      %[temp11], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
@ -401,8 +408,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "2:                                                  \n\t"
-    "lbu      %[temp1],  2(%[dst])                     \n\t"
-    "sb       %[temp18], 1(%[dst])                     \n\t"
+    "lbu      %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
    "sra      %[temp8],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
@ -410,8 +417,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "3:                                                  \n\t"
-    "lbu      %[temp18], 3(%[dst])                     \n\t"
-    "sb       %[temp1],  2(%[dst])                     \n\t"
+    "lbu      %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp18], %[temp18], %[temp16]          \n\t"
    "sra      %[temp16], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
@ -419,11 +426,11 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "4:                                                  \n\t"
-    "sb       %[temp18], 3(%[dst])                     \n\t"
-    "lbu      %[temp5],  32(%[dst])                    \n\t"
-    "lbu      %[temp8],  33(%[dst])                    \n\t"
-    "lbu      %[temp11], 34(%[dst])                    \n\t"
-    "lbu      %[temp16], 35(%[dst])                    \n\t"
+    "sb       %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
    "addu     %[temp11], %[temp11], %[temp12]          \n\t"
@ -452,14 +459,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
  "8:                                                  \n\t"
-    "sb       %[temp5],  32(%[dst])                    \n\t"
-    "sb       %[temp8],  33(%[dst])                    \n\t"
-    "sb       %[temp11], 34(%[dst])                    \n\t"
-    "sb       %[temp16], 35(%[dst])                    \n\t"
-    "lbu      %[temp5],  64(%[dst])                    \n\t"
-    "lbu      %[temp8],  65(%[dst])                    \n\t"
-    "lbu      %[temp11], 66(%[dst])                    \n\t"
-    "lbu      %[temp16], 67(%[dst])                    \n\t"
+    "sb       %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
    "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
    "addu     %[temp11], %[temp11], %[temp0]           \n\t"
@ -488,14 +495,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "12:                                                 \n\t"
-    "sb       %[temp5],  64(%[dst])                    \n\t"
-    "sb       %[temp8],  65(%[dst])                    \n\t"
-    "sb       %[temp11], 66(%[dst])                    \n\t"
-    "sb       %[temp16], 67(%[dst])                    \n\t"
-    "lbu      %[temp5],  96(%[dst])                    \n\t"
-    "lbu      %[temp8],  97(%[dst])                    \n\t"
-    "lbu      %[temp11], 98(%[dst])                    \n\t"
-    "lbu      %[temp16], 99(%[dst])                    \n\t"
+    "sb       %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
    "addu     %[temp11], %[temp11], %[temp4]           \n\t"
@ -524,10 +531,10 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "16:                                                 \n\t"
-    "sb       %[temp5],  96(%[dst])                    \n\t"
-    "sb       %[temp8],  97(%[dst])                    \n\t"
-    "sb       %[temp11], 98(%[dst])                    \n\t"
-    "sb       %[temp16], 99(%[dst])                    \n\t"
+    "sb       %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"

    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
@ -548,15 +555,12 @@ static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  }
 }

-#endif  // WEBP_USE_MIPS32
-
 //------------------------------------------------------------------------------
 // Entry point

 extern void VP8DspInitMIPS32(void);

-void VP8DspInitMIPS32(void) {
-#if defined(WEBP_USE_MIPS32)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
  VP8InitClipTables();

  VP8Transform = TransformTwo;
@ -574,5 +578,10 @@ void VP8DspInitMIPS32(void) {
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif  // WEBP_USE_MIPS32
 }
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@ -0,0 +1,994 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of dsp functions
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./mips_macro.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
+
+  __asm__ volatile (
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    "lh               %[temp5],  0(%[in])               \n\t"
+    "addiu            %[temp5],  %[temp5],  4           \n\t"
+    "ins              %[temp5],  %[temp5],  16, 16      \n\t"
+    "shra.ph          %[temp5],  %[temp5],  3           \n\t"
+    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
+                            temp3, temp1, temp2, temp3, temp4)
+    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
+                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
+                     dst, 0, 1, 2, 3, BPS)
+
+    OUTPUT_EARLY_CLOBBER_REGS_10()
+    : [in]"r"(in), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  const int a = in[0] + 4;
+  int c4 = MUL(in[4], kC2);
+  const int d4 = MUL(in[4], kC1);
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+  __asm__ volatile (
+    "ins              %[c4],      %[d4],     16,       16    \n\t"
+    "replv.ph         %[temp1],   %[a]                       \n\t"
+    "replv.ph         %[temp4],   %[d1]                      \n\t"
+    ADD_SUB_HALVES(temp2, temp3, temp1, c4)
+    "replv.ph         %[temp5],   %[c1]                      \n\t"
+    SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
+                   temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
+    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
+                            temp11, temp17, temp3, temp5, temp11, temp12)
+    PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
+                          temp4, temp7, temp6, temp10, temp9)
+    STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
+                     temp17, temp12, temp18, temp1, temp8, temp2, temp4,
+                     temp7, temp6, dst, 0, 1, 2, 3, BPS)
+
+    OUTPUT_EARLY_CLOBBER_REGS_18(),
+      [c4]"+&r"(c4)
+    : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
+    : "memory"
+  );
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+  __asm__ volatile (
+    "ulw              %[temp1],   0(%[in])                 \n\t"
+    "ulw              %[temp2],   16(%[in])                \n\t"
+    LOAD_IN_X2(temp5, temp6, 24, 26)
+    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+    LOAD_IN_X2(temp1, temp2, 8, 10)
+    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+                  temp13, temp11, temp14, temp12)
+    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+    "ulw              %[temp17],  4(%[in])                 \n\t"
+    "ulw              %[temp18],  20(%[in])                \n\t"
+    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+    LOAD_IN_X2(temp17, temp18, 12, 14)
+    LOAD_IN_X2(temp9, temp10, 28, 30)
+    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+                  temp15, temp4, temp16, temp17)
+    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+    // horizontal
+    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+    "repl.ph          %[temp2],   0x4                      \n\t"
+    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
+    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
+    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+                  temp6, temp17, temp8, temp18)
+    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+                  temp18, temp12, temp17, temp16)
+    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+                   temp6)
+    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+                          temp16, temp11, temp10, temp15, temp14)
+    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+                            temp11, temp10, temp11, temp14, temp15)
+    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+                     dst, 0, 1, 2, 3, BPS)
+
+    OUTPUT_EARLY_CLOBBER_REGS_18()
+    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
+    : "memory", "hi", "lo"
+  );
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);
+  }
+}
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15;
+
+  __asm__ volatile (
+    ".set      push                                      \n\t"
+    ".set      noreorder                                 \n\t"
+  "1:                                                    \n\t"
+    "negu      %[temp1],  %[hstride]                     \n\t"
+    "addiu     %[size],   %[size],        -1             \n\t"
+    "sll       %[temp2],  %[hstride],     1              \n\t"
+    "sll       %[temp3],  %[temp1],       1              \n\t"
+    "addu      %[temp4],  %[temp2],       %[hstride]     \n\t"
+    "addu      %[temp5],  %[temp3],       %[temp1]       \n\t"
+    "lbu       %[temp7],  0(%[p])                        \n\t"
+    "sll       %[temp6],  %[temp3],       1              \n\t"
+    "lbux      %[temp8],  %[temp5](%[p])                 \n\t"
+    "lbux      %[temp9],  %[temp3](%[p])                 \n\t"
+    "lbux      %[temp10], %[temp1](%[p])                 \n\t"
+    "lbux      %[temp11], %[temp6](%[p])                 \n\t"
+    "lbux      %[temp12], %[hstride](%[p])               \n\t"
+    "lbux      %[temp13], %[temp2](%[p])                 \n\t"
+    "lbux      %[temp14], %[temp4](%[p])                 \n\t"
+    "subu      %[temp1],  %[temp10],      %[temp7]       \n\t"
+    "subu      %[temp2],  %[temp9],       %[temp12]      \n\t"
+    "absq_s.w  %[temp3],  %[temp1]                       \n\t"
+    "absq_s.w  %[temp4],  %[temp2]                       \n\t"
+    "negu      %[temp1],  %[temp1]                       \n\t"
+    "sll       %[temp3],  %[temp3],       2              \n\t"
+    "addu      %[temp15], %[temp3],       %[temp4]       \n\t"
+    "subu      %[temp3],  %[temp15],      %[thresh2]     \n\t"
+    "sll       %[temp6],  %[temp1],       1              \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp4],  %[temp11],      %[temp8]       \n\t"
+    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
+    "shll_s.w  %[temp2],  %[temp2],       24             \n\t"
+    "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
+    "bgtz      %[temp4],  3f                             \n\t"
+    " subu     %[temp3],  %[temp8],       %[temp9]       \n\t"
+    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp5],  %[temp9],       %[temp10]      \n\t"
+    "absq_s.w  %[temp3],  %[temp5]                       \n\t"
+    "absq_s.w  %[temp5],  %[temp5]                       \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp3],  %[temp14],      %[temp13]      \n\t"
+    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
+    "slt       %[temp5],  %[hev_thresh],  %[temp5]       \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp3],  %[temp13],      %[temp12]      \n\t"
+    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
+    "sra       %[temp4],  %[temp2],       24             \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp15], %[temp12],      %[temp7]       \n\t"
+    "absq_s.w  %[temp3],  %[temp15]                      \n\t"
+    "absq_s.w  %[temp15], %[temp15]                      \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " slt      %[temp15], %[hev_thresh],  %[temp15]      \n\t"
+    "addu      %[temp3],  %[temp6],       %[temp1]       \n\t"
+    "or        %[temp2],  %[temp5],       %[temp15]      \n\t"
+    "addu      %[temp5],  %[temp4],       %[temp3]       \n\t"
+    "beqz      %[temp2],  4f                             \n\t"
+    " shra_r.w %[temp1],  %[temp5],       3              \n\t"
+    "addiu     %[temp2],  %[temp5],       3              \n\t"
+    "sra       %[temp2],  %[temp2],       3              \n\t"
+    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
+    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
+    "subu      %[temp3],  %[p],           %[hstride]     \n\t"
+    "sra       %[temp1],  %[temp1],       27             \n\t"
+    "sra       %[temp2],  %[temp2],       27             \n\t"
+    "subu      %[temp1],  %[temp7],       %[temp1]       \n\t"
+    "addu      %[temp2],  %[temp10],      %[temp2]       \n\t"
+    "lbux      %[temp2],  %[temp2](%[VP8kclip1])         \n\t"
+    "lbux      %[temp1],  %[temp1](%[VP8kclip1])         \n\t"
+    "sb        %[temp2],  0(%[temp3])                    \n\t"
+    "j         3f                                        \n\t"
+    " sb       %[temp1],  0(%[p])                        \n\t"
+  "4:                                                    \n\t"
+    "shll_s.w  %[temp5],  %[temp5],       24             \n\t"
+    "subu      %[temp14], %[p],           %[hstride]     \n\t"
+    "subu      %[temp11], %[temp14],      %[hstride]     \n\t"
+    "sra       %[temp6],  %[temp5],       24             \n\t"
+    "sll       %[temp1],  %[temp6],       3              \n\t"
+    "subu      %[temp15], %[temp11],      %[hstride]     \n\t"
+    "addu      %[temp2],  %[temp6],       %[temp1]       \n\t"
+    "sll       %[temp3],  %[temp2],       1              \n\t"
+    "addu      %[temp4],  %[temp3],       %[temp2]       \n\t"
+    "addiu     %[temp2],  %[temp2],       63             \n\t"
+    "addiu     %[temp3],  %[temp3],       63             \n\t"
+    "addiu     %[temp4],  %[temp4],       63             \n\t"
+    "sra       %[temp2],  %[temp2],       7              \n\t"
+    "sra       %[temp3],  %[temp3],       7              \n\t"
+    "sra       %[temp4],  %[temp4],       7              \n\t"
+    "addu      %[temp1],  %[temp8],       %[temp2]       \n\t"
+    "addu      %[temp5],  %[temp9],       %[temp3]       \n\t"
+    "addu      %[temp6],  %[temp10],      %[temp4]       \n\t"
+    "subu      %[temp8],  %[temp7],       %[temp4]       \n\t"
+    "subu      %[temp7],  %[temp12],      %[temp3]       \n\t"
+    "addu      %[temp10], %[p],           %[hstride]     \n\t"
+    "subu      %[temp9],  %[temp13],      %[temp2]       \n\t"
+    "addu      %[temp12], %[temp10],      %[hstride]     \n\t"
+    "lbux      %[temp2],  %[temp1](%[VP8kclip1])         \n\t"
+    "lbux      %[temp3],  %[temp5](%[VP8kclip1])         \n\t"
+    "lbux      %[temp4],  %[temp6](%[VP8kclip1])         \n\t"
+    "lbux      %[temp5],  %[temp8](%[VP8kclip1])         \n\t"
+    "lbux      %[temp6],  %[temp7](%[VP8kclip1])         \n\t"
+    "lbux      %[temp8],  %[temp9](%[VP8kclip1])         \n\t"
+    "sb        %[temp2],  0(%[temp15])                   \n\t"
+    "sb        %[temp3],  0(%[temp11])                   \n\t"
+    "sb        %[temp4],  0(%[temp14])                   \n\t"
+    "sb        %[temp5],  0(%[p])                        \n\t"
+    "sb        %[temp6],  0(%[temp10])                   \n\t"
+    "sb        %[temp8],  0(%[temp12])                   \n\t"
+  "3:                                                    \n\t"
+    "bgtz      %[size],   1b                             \n\t"
+    " addu     %[p],      %[p],           %[vstride]     \n\t"
+    ".set      pop                                       \n\t"
+    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+      [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
+      [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
+      [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
+      [size]"+&r"(size), [p]"+&r"(p)
+    : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
+      [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
+      [VP8kclip1]"r"(VP8kclip1)
+    : "memory"
+  );
+}
+
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  int p0, q0, p1, q1, p2, q2, p3, q3;
+  int step1, step2, temp1, temp2, temp3, temp4;
+  uint8_t* pTemp0;
+  uint8_t* pTemp1;
+  const int thresh2 = 2 * thresh + 1;
+
+  __asm__ volatile (
+    ".set      push                                   \n\t"
+    ".set      noreorder                              \n\t"
+    "bltz      %[size],    3f                         \n\t"
+    " nop                                             \n\t"
+  "2:                                                 \n\t"
+    "negu      %[step1],   %[hstride]                 \n\t"
+    "lbu       %[q0],      0(%[p])                    \n\t"
+    "lbux      %[p0],      %[step1](%[p])             \n\t"
+    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
+    "lbux      %[q1],      %[hstride](%[p])           \n\t"
+    "subu      %[temp1],   %[p0],         %[q0]       \n\t"
+    "lbux      %[p1],      %[step1](%[p])             \n\t"
+    "addu      %[step2],   %[hstride],    %[hstride]  \n\t"
+    "absq_s.w  %[temp2],   %[temp1]                   \n\t"
+    "subu      %[temp3],   %[p1],         %[q1]       \n\t"
+    "absq_s.w  %[temp4],   %[temp3]                   \n\t"
+    "sll       %[temp2],   %[temp2],      2           \n\t"
+    "addu      %[temp2],   %[temp2],      %[temp4]    \n\t"
+    "subu      %[temp4],   %[temp2],      %[thresh2]  \n\t"
+    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " lbux     %[p2],      %[step1](%[p])             \n\t"
+    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
+    "lbux      %[q2],      %[step2](%[p])             \n\t"
+    "lbux      %[p3],      %[step1](%[p])             \n\t"
+    "subu      %[temp4],   %[p2],         %[p1]       \n\t"
+    "addu      %[step2],   %[step2],      %[hstride]  \n\t"
+    "subu      %[temp2],   %[p3],         %[p2]       \n\t"
+    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
+    "absq_s.w  %[temp2],   %[temp2]                   \n\t"
+    "lbux      %[q3],      %[step2](%[p])             \n\t"
+    "subu      %[temp4],   %[temp4],      %[ithresh]  \n\t"
+    "negu      %[temp1],   %[temp1]                   \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " subu     %[temp2],   %[temp2],      %[ithresh]  \n\t"
+    "subu      %[p3],      %[p1],         %[p0]       \n\t"
+    "bgtz      %[temp2],   0f                         \n\t"
+    " absq_s.w %[p3],      %[p3]                      \n\t"
+    "subu      %[temp4],   %[q3],         %[q2]       \n\t"
+    "subu      %[pTemp0],  %[p],          %[hstride]  \n\t"
+    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
+    "subu      %[temp2],   %[p3],         %[ithresh]  \n\t"
+    "sll       %[step1],   %[temp1],      1           \n\t"
+    "bgtz      %[temp2],   0f                         \n\t"
+    " subu     %[temp4],   %[temp4],      %[ithresh]  \n\t"
+    "subu      %[temp2],   %[q2],         %[q1]       \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " absq_s.w %[temp2],   %[temp2]                   \n\t"
+    "subu      %[q3],      %[q1],         %[q0]       \n\t"
+    "absq_s.w  %[q3],      %[q3]                      \n\t"
+    "subu      %[temp2],   %[temp2],      %[ithresh]  \n\t"
+    "addu      %[temp1],   %[temp1],      %[step1]    \n\t"
+    "bgtz      %[temp2],   0f                         \n\t"
+    " subu     %[temp4],   %[q3],         %[ithresh]  \n\t"
+    "slt       %[p3],      %[hev_thresh], %[p3]       \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " slt      %[q3],      %[hev_thresh], %[q3]       \n\t"
+    "or        %[q3],      %[q3],         %[p3]       \n\t"
+    "bgtz      %[q3],      1f                         \n\t"
+    " shra_r.w %[temp2],   %[temp1],      3           \n\t"
+    "addiu     %[temp1],   %[temp1],      3           \n\t"
+    "sra       %[temp1],   %[temp1],      3           \n\t"
+    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
+    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
+    "addu      %[pTemp1],  %[p],          %[hstride]  \n\t"
+    "sra       %[temp2],   %[temp2],      27          \n\t"
+    "sra       %[temp1],   %[temp1],      27          \n\t"
+    "addiu     %[step1],   %[temp2],      1           \n\t"
+    "sra       %[step1],   %[step1],      1           \n\t"
+    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
+    "addu      %[p1],      %[p1],         %[step1]    \n\t"
+    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
+    "subu      %[q1],      %[q1],         %[step1]    \n\t"
+    "lbux      %[temp2],   %[p0](%[VP8kclip1])        \n\t"
+    "lbux      %[temp3],   %[q0](%[VP8kclip1])        \n\t"
+    "lbux      %[temp4],   %[q1](%[VP8kclip1])        \n\t"
+    "sb        %[temp2],   0(%[pTemp0])               \n\t"
+    "lbux      %[temp1],   %[p1](%[VP8kclip1])        \n\t"
+    "subu      %[pTemp0],  %[pTemp0],    %[hstride]   \n\t"
+    "sb        %[temp3],   0(%[p])                    \n\t"
+    "sb        %[temp4],   0(%[pTemp1])               \n\t"
+    "j         0f                                     \n\t"
+    " sb       %[temp1],   0(%[pTemp0])               \n\t"
+  "1:                                                 \n\t"
+    "shll_s.w  %[temp3],   %[temp3],      24          \n\t"
+    "sra       %[temp3],   %[temp3],      24          \n\t"
+    "addu      %[temp1],   %[temp1],      %[temp3]    \n\t"
+    "shra_r.w  %[temp2],   %[temp1],      3           \n\t"
+    "addiu     %[temp1],   %[temp1],      3           \n\t"
+    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
+    "sra       %[temp1],   %[temp1],      3           \n\t"
+    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
+    "sra       %[temp2],   %[temp2],      27          \n\t"
+    "sra       %[temp1],   %[temp1],      27          \n\t"
+    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
+    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
+    "lbux      %[temp1],   %[p0](%[VP8kclip1])        \n\t"
+    "lbux      %[temp2],   %[q0](%[VP8kclip1])        \n\t"
+    "sb        %[temp2],   0(%[p])                    \n\t"
+    "sb        %[temp1],   0(%[pTemp0])               \n\t"
+  "0:                                                 \n\t"
+    "subu      %[size],    %[size],       1           \n\t"
+    "bgtz      %[size],    2b                         \n\t"
+    " addu     %[p],       %[p],          %[vstride]  \n\t"
+  "3:                                                 \n\t"
+    ".set      pop                                    \n\t"
+    : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
+      [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
+      [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+      [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
+      [size]"+&r"(size)
+    : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
+      [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
+      [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+    : "memory"
+  );
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+#undef MUL
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+  uint8_t* p1 = p - stride;
+  __asm__ volatile (
+    ".set      push                                      \n\t"
+    ".set      noreorder                                 \n\t"
+    "li        %[i],        16                           \n\t"
+  "0:                                                    \n\t"
+    "negu      %[temp4],    %[stride]                    \n\t"
+    "sll       %[temp5],    %[temp4],       1            \n\t"
+    "lbu       %[temp2],    0(%[p])                      \n\t"
+    "lbux      %[temp3],    %[stride](%[p])              \n\t"
+    "lbux      %[temp1],    %[temp4](%[p])               \n\t"
+    "lbux      %[temp0],    %[temp5](%[p])               \n\t"
+    "subu      %[temp7],    %[temp1],       %[temp2]     \n\t"
+    "subu      %[temp6],    %[temp0],       %[temp3]     \n\t"
+    "absq_s.w  %[temp4],    %[temp7]                     \n\t"
+    "absq_s.w  %[temp5],    %[temp6]                     \n\t"
+    "sll       %[temp4],    %[temp4],       2            \n\t"
+    "subu      %[temp5],    %[temp5],       %[thresh2]   \n\t"
+    "addu      %[temp5],    %[temp4],       %[temp5]     \n\t"
+    "negu      %[temp8],    %[temp7]                     \n\t"
+    "bgtz      %[temp5],    1f                           \n\t"
+    " addiu    %[i],        %[i],           -1           \n\t"
+    "sll       %[temp4],    %[temp8],       1            \n\t"
+    "shll_s.w  %[temp5],    %[temp6],       24           \n\t"
+    "addu      %[temp3],    %[temp4],       %[temp8]     \n\t"
+    "sra       %[temp5],    %[temp5],       24           \n\t"
+    "addu      %[temp3],    %[temp3],       %[temp5]     \n\t"
+    "addiu     %[temp7],    %[temp3],       3            \n\t"
+    "sra       %[temp7],    %[temp7],       3            \n\t"
+    "shra_r.w  %[temp8],    %[temp3],       3            \n\t"
+    "shll_s.w  %[temp0],    %[temp7],       27           \n\t"
+    "shll_s.w  %[temp4],    %[temp8],       27           \n\t"
+    "sra       %[temp0],    %[temp0],       27           \n\t"
+    "sra       %[temp4],    %[temp4],       27           \n\t"
+    "addu      %[temp7],    %[temp1],       %[temp0]     \n\t"
+    "subu      %[temp2],    %[temp2],       %[temp4]     \n\t"
+    "lbux      %[temp3],    %[temp7](%[VP8kclip1])       \n\t"
+    "lbux      %[temp4],    %[temp2](%[VP8kclip1])       \n\t"
+    "sb        %[temp3],    0(%[p1])                     \n\t"
+    "sb        %[temp4],    0(%[p])                      \n\t"
+  "1:                                                    \n\t"
+    "addiu     %[p1],       %[p1],          1            \n\t"
+    "bgtz      %[i],        0b                           \n\t"
+    " addiu    %[p],        %[p],           1            \n\t"
+    " .set     pop                                       \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
+    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+    : "memory"
+  );
+}
+
+// TEMP0 = SRC[A + A1 * BPS]
+// TEMP1 = SRC[B + B1 * BPS]
+// TEMP2 = SRC[C + C1 * BPS]
+// TEMP3 = SRC[D + D1 * BPS]
+#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
+                     A, A1, B, B1, C, C1, D, D1, SRC)                          \
+  "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+  __asm__ volatile (
+    ".set      push                                     \n\t"
+    ".set      noreorder                                \n\t"
+    "li        %[i],       16                           \n\t"
+  "0:                                                   \n\t"
+    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
+    "subu      %[temp7],    %[temp1],       %[temp2]    \n\t"
+    "subu      %[temp6],    %[temp0],       %[temp3]    \n\t"
+    "absq_s.w  %[temp4],    %[temp7]                    \n\t"
+    "absq_s.w  %[temp5],    %[temp6]                    \n\t"
+    "sll       %[temp4],    %[temp4],       2           \n\t"
+    "addu      %[temp5],    %[temp4],       %[temp5]    \n\t"
+    "subu      %[temp5],    %[temp5],       %[thresh2]  \n\t"
+    "negu      %[temp8],    %[temp7]                    \n\t"
+    "bgtz      %[temp5],    1f                          \n\t"
+    " addiu    %[i],        %[i],           -1          \n\t"
+    "sll       %[temp4],    %[temp8],       1           \n\t"
+    "shll_s.w  %[temp5],    %[temp6],       24          \n\t"
+    "addu      %[temp3],    %[temp4],       %[temp8]    \n\t"
+    "sra       %[temp5],    %[temp5],       24          \n\t"
+    "addu      %[temp3],    %[temp3],       %[temp5]    \n\t"
+    "addiu     %[temp7],    %[temp3],       3           \n\t"
+    "sra       %[temp7],    %[temp7],       3           \n\t"
+    "shra_r.w  %[temp8],    %[temp3],       3           \n\t"
+    "shll_s.w  %[temp0],    %[temp7],       27          \n\t"
+    "shll_s.w  %[temp4],    %[temp8],       27          \n\t"
+    "sra       %[temp0],    %[temp0],       27          \n\t"
+    "sra       %[temp4],    %[temp4],       27          \n\t"
+    "addu      %[temp7],    %[temp1],       %[temp0]    \n\t"
+    "subu      %[temp2],    %[temp2],       %[temp4]    \n\t"
+    "lbux      %[temp3],    %[temp7](%[VP8kclip1])      \n\t"
+    "lbux      %[temp4],    %[temp2](%[VP8kclip1])      \n\t"
+    "sb        %[temp3],    -1(%[p])                    \n\t"
+    "sb        %[temp4],    0(%[p])                     \n\t"
+  "1:                                                   \n\t"
+    "bgtz      %[i],        0b                          \n\t"
+    " addu     %[p],        %[p],           %[stride]   \n\t"
+    ".set      pop                                      \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [p]"+&r"(p), [i]"=&r"(i)
+    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+    : "memory"
+  );
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16(p, stride, thresh);
+  }
+}
+
+// DST[A * BPS]     = TEMP0
+// DST[B + C * BPS] = TEMP1
+#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                              \
+  "usw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #DST "])         \n\t"     \
+  "usw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #DST "])  \n\t"
+
+static void VE4(uint8_t* dst) {    // vertical
+  const uint8_t* top = dst - BPS;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+  __asm__ volatile (
+    "ulw             %[temp0],   -1(%[top])              \n\t"
+    "ulh             %[temp1],   3(%[top])               \n\t"
+    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
+    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
+    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
+    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
+    "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
+    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
+    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
+    "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
+    "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
+    "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
+    "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
+    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
+    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
+    "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
+    STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
+    STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void DC4(uint8_t* dst) {   // DC
+  int temp0, temp1, temp2, temp3, temp4;
+  __asm__ volatile (
+    "ulw          %[temp0],   -1*" XSTR(BPS) "(%[dst]) \n\t"
+    LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    "ins          %[temp1],   %[temp2],    8,     8    \n\t"
+    "ins          %[temp1],   %[temp3],    16,    8    \n\t"
+    "ins          %[temp1],   %[temp4],    24,    8    \n\t"
+    "raddu.w.qb   %[temp0],   %[temp0]                 \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                 \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]    \n\t"
+    "shra_r.w     %[temp0],   %[temp0],    3           \n\t"
+    "replv.qb     %[temp0],   %[temp0]                 \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void RD4(uint8_t* dst) {   // Down-right
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8;
+  __asm__ volatile (
+    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    "ulw            %[temp7],   -1-" XSTR(BPS) "(%[dst])       \n\t"
+    "ins            %[temp1],   %[temp0], 16, 16               \n\t"
+    "preceu.ph.qbr  %[temp5],   %[temp7]                       \n\t"
+    "ins            %[temp2],   %[temp1], 16, 16               \n\t"
+    "preceu.ph.qbl  %[temp4],   %[temp7]                       \n\t"
+    "ins            %[temp3],   %[temp2], 16, 16               \n\t"
+    "shll.ph        %[temp2],   %[temp2], 1                    \n\t"
+    "addq.ph        %[temp3],   %[temp3], %[temp1]             \n\t"
+    "packrl.ph      %[temp6],   %[temp5], %[temp1]             \n\t"
+    "addq.ph        %[temp3],   %[temp3], %[temp2]             \n\t"
+    "addq.ph        %[temp1],   %[temp1], %[temp5]             \n\t"
+    "shll.ph        %[temp6],   %[temp6], 1                    \n\t"
+    "addq.ph        %[temp1],   %[temp1], %[temp6]             \n\t"
+    "packrl.ph      %[temp0],   %[temp4], %[temp5]             \n\t"
+    "addq.ph        %[temp8],   %[temp5], %[temp4]             \n\t"
+    "shra_r.ph      %[temp3],   %[temp3], 2                    \n\t"
+    "shll.ph        %[temp0],   %[temp0], 1                    \n\t"
+    "shra_r.ph      %[temp1],   %[temp1], 2                    \n\t"
+    "addq.ph        %[temp8],   %[temp0], %[temp8]             \n\t"
+    "lbu            %[temp5],   3-" XSTR(BPS) "(%[dst])        \n\t"
+    "precrq.ph.w    %[temp7],   %[temp7], %[temp7]             \n\t"
+    "shra_r.ph      %[temp8],   %[temp8], 2                    \n\t"
+    "ins            %[temp7],   %[temp5], 0,  8                \n\t"
+    "precr.qb.ph    %[temp2],   %[temp1], %[temp3]             \n\t"
+    "raddu.w.qb     %[temp4],   %[temp7]                       \n\t"
+    "precr.qb.ph    %[temp6],   %[temp8], %[temp1]             \n\t"
+    "shra_r.w       %[temp4],   %[temp4], 2                    \n\t"
+    STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
+    "prepend        %[temp2],   %[temp8], 8                    \n\t"
+    "prepend        %[temp6],   %[temp4], 8                    \n\t"
+    STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+// TEMP0 = SRC[A * BPS]
+// TEMP1 = SRC[B + C * BPS]
+#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                               \
+  "ulw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #SRC "])         \n\t"     \
+  "ulw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "])  \n\t"
+
+static void LD4(uint8_t* dst) {   // Down-Left
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  __asm__ volatile (
+    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+    "preceu.ph.qbl   %[temp2],    %[temp0]                     \n\t"
+    "preceu.ph.qbr   %[temp3],    %[temp0]                     \n\t"
+    "preceu.ph.qbr   %[temp4],    %[temp1]                     \n\t"
+    "preceu.ph.qbl   %[temp5],    %[temp1]                     \n\t"
+    "packrl.ph       %[temp6],    %[temp2],    %[temp3]        \n\t"
+    "packrl.ph       %[temp7],    %[temp4],    %[temp2]        \n\t"
+    "packrl.ph       %[temp8],    %[temp5],    %[temp4]        \n\t"
+    "shll.ph         %[temp6],    %[temp6],    1               \n\t"
+    "addq.ph         %[temp9],    %[temp2],    %[temp6]        \n\t"
+    "shll.ph         %[temp7],    %[temp7],    1               \n\t"
+    "addq.ph         %[temp9],    %[temp9],    %[temp3]        \n\t"
+    "shll.ph         %[temp8],    %[temp8],    1               \n\t"
+    "shra_r.ph       %[temp9],    %[temp9],    2               \n\t"
+    "addq.ph         %[temp3],    %[temp4],    %[temp7]        \n\t"
+    "addq.ph         %[temp0],    %[temp5],    %[temp8]        \n\t"
+    "addq.ph         %[temp3],    %[temp3],    %[temp2]        \n\t"
+    "addq.ph         %[temp0],    %[temp0],    %[temp4]        \n\t"
+    "shra_r.ph       %[temp3],    %[temp3],    2               \n\t"
+    "shra_r.ph       %[temp0],    %[temp0],    2               \n\t"
+    "srl             %[temp1],    %[temp1],    24              \n\t"
+    "sll             %[temp1],    %[temp1],    1               \n\t"
+    "raddu.w.qb      %[temp5],    %[temp5]                     \n\t"
+    "precr.qb.ph     %[temp9],    %[temp3],    %[temp9]        \n\t"
+    "precr.qb.ph     %[temp3],    %[temp0],    %[temp3]        \n\t"
+    "addu            %[temp1],    %[temp1],    %[temp5]        \n\t"
+    "shra_r.w        %[temp1],    %[temp1],    2               \n\t"
+    STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
+    "prepend         %[temp9],    %[temp0],    8               \n\t"
+    "prepend         %[temp3],    %[temp1],    8               \n\t"
+    STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void DC8uv(uint8_t* dst) {     // DC
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  __asm__ volatile (
+    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
+    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
+    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
+    "addu         %[temp8],   %[temp8],    %[temp9]      \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
+    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp2]      \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp6]      \n\t"
+    "shra_r.w     %[temp0],   %[temp0],    4             \n\t"
+    "replv.qb     %[temp0],   %[temp0]                   \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+  int temp0, temp1;
+  __asm__ volatile (
+    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
+    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
+    "replv.qb     %[temp0],   %[temp0]                   \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8;
+  __asm__ volatile (
+    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
+    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
+    "addu         %[temp8],   %[temp8],    %[temp1]      \n\t"
+    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
+    "addu         %[temp0],   %[temp6],    %[temp2]      \n\t"
+    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
+    "replv.qb     %[temp0],   %[temp0]                   \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+#undef LOAD_8_BYTES
+#undef STORE_8_BYTES
+#undef LOAD_4_BYTES
+
+#define CLIPPING(SIZE)                                                         \
+  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
+  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
+  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
+".endif                                                  \n\t"                 \
+  "addu.ph         %[temp2],   %[temp2],   %[dst_1]      \n\t"                 \
+  "addu.ph         %[temp0],   %[temp0],   %[dst_1]      \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "addu.ph         %[temp3],   %[temp3],   %[dst_1]      \n\t"                 \
+  "addu.ph         %[temp1],   %[temp1],   %[dst_1]      \n\t"                 \
+".endif                                                  \n\t"                 \
+  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
+  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
+  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
+".endif                                                  \n\t"                 \
+  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"                 \
+".endif                                                  \n\t"
+
+
+#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
+  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                              \
+  int temp0, temp1, temp2, temp3;                                              \
+  __asm__ volatile (                                                           \
+  ".if " #SIZE " < 8                                     \n\t"                 \
+    "ulw             %[temp0],   0(%[top])               \n\t"                 \
+    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
+    CLIPPING(4)                                                                \
+    "usw             %[temp0],   0(%[dst])               \n\t"                 \
+  ".else                                                 \n\t"                 \
+    "ulw             %[temp0],   0(%[top])               \n\t"                 \
+    "ulw             %[temp1],   4(%[top])               \n\t"                 \
+    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
+    CLIPPING(8)                                                                \
+    "usw             %[temp0],   0(%[dst])               \n\t"                 \
+    "usw             %[temp1],   4(%[dst])               \n\t"                 \
+  ".if " #SIZE " == 16                                   \n\t"                 \
+    "ulw             %[temp0],   8(%[top])               \n\t"                 \
+    "ulw             %[temp1],   12(%[top])              \n\t"                 \
+    CLIPPING(8)                                                                \
+    "usw             %[temp0],   8(%[dst])               \n\t"                 \
+    "usw             %[temp1],   12(%[dst])              \n\t"                 \
+  ".endif                                                \n\t"                 \
+  ".endif                                                \n\t"                 \
+    : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),           \
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
+    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                      \
+    : "memory"                                                                 \
+  );                                                                           \
+} while (0)
+
+#define CLIP_TO_DST(DST, SIZE) do {                                            \
+  int y;                                                                       \
+  const uint8_t* top = (DST) - BPS;                                            \
+  const int top_1 = ((int)top[-1] << 16) + top[-1];                            \
+  for (y = 0; y < (SIZE); ++y) {                                               \
+    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
+    (DST) += BPS;                                                              \
+  }                                                                            \
+} while (0)
+
+#define TRUE_MOTION(DST, SIZE)                                                 \
+static void TrueMotion##SIZE(uint8_t* (DST)) {                                 \
+  CLIP_TO_DST((DST), (SIZE));                                                  \
+}
+
+TRUE_MOTION(dst, 4)
+TRUE_MOTION(dst, 8)
+TRUE_MOTION(dst, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
+  VP8TransformDC = TransformDC;
+  VP8TransformAC3 = TransformAC3;
+  VP8Transform = TransformTwo;
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TrueMotion4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[6] = LD4;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TrueMotion8;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+
+  VP8PredLuma16[1] = TrueMotion16;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@ -24,24 +24,24 @@

 // Load/Store vertical edge
 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
-  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
+  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

 #define STORE8x2(c1, c2, p, stride)                                            \
-  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
+  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

 #if !defined(WORK_AROUND_GCC)

@ -389,9 +389,9 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,

 #endif  // !WORK_AROUND_GCC

-// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
-  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
+// Zero extend 'v' to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
+  return vreinterpretq_s16_u16(vmovl_u8(v));
 }

 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
@ -423,8 +423,8 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,

  {
    // Convert to 16b.
-    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
-    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+    const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
+    const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
@ -479,6 +479,21 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {

 //------------------------------------------------------------------------------

+static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
+                               const int8x16_t delta,
+                               int8x16_t* const op0, int8x16_t* const oq0) {
+  const int8x16_t kCst3 = vdupq_n_s8(0x03);
+  const int8x16_t kCst4 = vdupq_n_s8(0x04);
+  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
+  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
+  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
+  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
+  *op0 = vqaddq_s8(p0s, delta3);
+  *oq0 = vqsubq_s8(q0s, delta4);
+}
+
+#if defined(WEBP_USE_INTRINSICS)
+
 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
                         const int8x16_t delta,
                         uint8x16_t* const op0, uint8x16_t* const oq0) {
@ -494,8 +509,6 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
  *oq0 = FlipSignBack(sq0);
 }

-#if defined(USE_INTRINSICS)
-
 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
                      const uint8x16_t q0, const uint8x16_t q1,
                      const uint8x16_t mask,
@ -626,7 +639,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  );
 }

-#endif    // USE_INTRINSICS
+#endif    // WEBP_USE_INTRINSICS

 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  uint32_t k;
@ -721,11 +734,7 @@ static void DoFilter4(
    const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
    const int8x16_t simple_lf_delta =
        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
-    uint8x16_t tmp_p0, tmp_q0;
-    ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
-    // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
-    p0s = FlipSign(tmp_p0);
-    q0s = FlipSign(tmp_q0);
+    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter4 part (complex loopfilter on pixels without hev)
@ -797,11 +806,7 @@ static void DoFilter6(
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
-    uint8x16_t tmp_p0, tmp_q0;
-    ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
-    // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
-    p0s = FlipSign(tmp_p0);
-    q0s = FlipSign(tmp_q0);
+    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
@ -986,7 +991,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 static const int16_t kC1 = 20091;
 static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.

-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)
 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
                                     int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
@ -1163,7 +1168,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  );
 }

-#endif    // USE_INTRINSICS
+#endif    // WEBP_USE_INTRINSICS

 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
@ -1241,7 +1246,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1_full = 20091 + (1 << 16);
  static const int kC2_full = 35468;
-  const int16x4_t A = vdup_n_s16(in[0]);
+  const int16x4_t A = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
@ -1258,15 +1263,330 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 }
 #undef MUL

-#endif   // WEBP_USE_NEON
+//------------------------------------------------------------------------------
+// 4x4
+
+static void DC4(uint8_t* dst) {    // DC
+  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
+  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+  const uint16x4_t p1 = vpadd_u16(p0, p0);
+  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+  const uint16x8_t s0 = vaddq_u16(L0, L1);
+  const uint16x8_t s1 = vaddq_u16(L2, L3);
+  const uint16x8_t s01 = vaddq_u16(s0, s1);
+  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
+  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
+  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+  int i;
+  for (i = 0; i < 4; ++i) {
+    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
+  }
+}
+
+// TrueMotion (4x4 + 8x8)
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
+  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
+  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
+  int y;
+  for (y = 0; y < size; y += 4) {
+    // left edge
+    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
+    const int16x8_t r1 = vaddq_s16(L1, d);
+    const int16x8_t r2 = vaddq_s16(L2, d);
+    const int16x8_t r3 = vaddq_s16(L3, d);
+    // Saturate and store the result.
+    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
+    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
+    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
+    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
+    if (size == 4) {
+      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
+    } else {
+      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
+      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
+      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
+      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
+    }
+    dst += 4 * BPS;
+  }
+}
+
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+
+static void VE4(uint8_t* dst) {    // vertical
+  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);
+  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
+  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
+  int i;
+  for (i = 0; i < 4; ++i) {
+    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
+  }
+}
+
+static void RD4(uint8_t* dst) {   // Down-right
+  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
+  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+  const uint32_t I = dst[-1 + 0 * BPS];
+  const uint32_t J = dst[-1 + 1 * BPS];
+  const uint32_t K = dst[-1 + 2 * BPS];
+  const uint32_t L = dst[-1 + 3 * BPS];
+  const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
+static void LD4(uint8_t* dst) {    // Down-left
+  // Note using the same shift trick as VE4() is slower here.
+  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
+  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
+  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
+  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t* dst) {    // vertical
+  const uint8x8_t top = vld1_u8(dst - BPS);
+  int j;
+  for (j = 0; j < 8; ++j) {
+    vst1_u8(dst + j * BPS, top);
+  }
+}
+
+static void HE8uv(uint8_t* dst) {    // horizontal
+  int j;
+  for (j = 0; j < 8; ++j) {
+    const uint8x8_t left = vld1_dup_u8(dst - 1);
+    vst1_u8(dst, left);
+    dst += BPS;
+  }
+}
+
+static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_top) {
+    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_top = vcombine_u16(p2, p2);
+  }
+
+  if (do_left) {
+    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
+    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
+    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
+    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
+    const uint16x8_t s0 = vaddq_u16(L0, L1);
+    const uint16x8_t s1 = vaddq_u16(L2, L3);
+    const uint16x8_t s2 = vaddq_u16(L4, L5);
+    const uint16x8_t s3 = vaddq_u16(L6, L7);
+    const uint16x8_t s01 = vaddq_u16(s0, s1);
+    const uint16x8_t s23 = vaddq_u16(s2, s3);
+    sum_left = vaddq_u16(s01, s23);
+  }
+
+  if (do_top && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 4);
+  } else if (do_top) {
+    dc0 = vrshrn_n_u16(sum_top, 3);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 3);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 8; ++i) {
+      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
+    }
+  }
+}
+
+static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
+static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
+static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
+static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
+
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+
+//------------------------------------------------------------------------------
+// 16x16
+
+static void VE16(uint8_t* dst) {     // vertical
+  const uint8x16_t top = vld1q_u8(dst - BPS);
+  int j;
+  for (j = 0; j < 16; ++j) {
+    vst1q_u8(dst + j * BPS, top);
+  }
+}
+
+static void HE16(uint8_t* dst) {     // horizontal
+  int j;
+  for (j = 0; j < 16; ++j) {
+    const uint8x16_t left = vld1q_dup_u8(dst - 1);
+    vst1q_u8(dst, left);
+    dst += BPS;
+  }
+}
+
+static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_top) {
+    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
+    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    const uint16x4_t p3 = vpadd_u16(p2, p2);
+    sum_top = vcombine_u16(p3, p3);
+  }
+
+  if (do_left) {
+    int i;
+    sum_left = vdupq_n_u16(0);
+    for (i = 0; i < 16; i += 8) {
+      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
+      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
+      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
+      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
+      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
+      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
+      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
+      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
+      const uint16x8_t s0 = vaddq_u16(L0, L1);
+      const uint16x8_t s1 = vaddq_u16(L2, L3);
+      const uint16x8_t s2 = vaddq_u16(L4, L5);
+      const uint16x8_t s3 = vaddq_u16(L6, L7);
+      const uint16x8_t s01 = vaddq_u16(s0, s1);
+      const uint16x8_t s23 = vaddq_u16(s2, s3);
+      const uint16x8_t sum = vaddq_u16(s01, s23);
+      sum_left = vaddq_u16(sum_left, sum);
+    }
+  }
+
+  if (do_top && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 5);
+  } else if (do_top) {
+    dc0 = vrshrn_n_u16(sum_top, 4);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 4);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 16; ++i) {
+      vst1q_u8(dst + i * BPS, dc);
+    }
+  }
+}
+
+static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
+static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
+static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
+static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
+
+static void TM16(uint8_t* dst) {
+  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
+  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
+  // A[c] - A[-1]
+  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
+  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
+  int y;
+  for (y = 0; y < 16; y += 4) {
+    // left edge
+    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
+    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
+    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
+    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
+    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
+    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
+    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
+    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
+    // Saturate and store the result.
+    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
+    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
+    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
+    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
+    vst1q_u8(dst + 0 * BPS, row0);
+    vst1q_u8(dst + 1 * BPS, row1);
+    vst1q_u8(dst + 2 * BPS, row2);
+    vst1q_u8(dst + 3 * BPS, row3);
+    dst += 4 * BPS;
+  }
+}

 //------------------------------------------------------------------------------
 // Entry point

 extern void VP8DspInitNEON(void);

-void VP8DspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo;
  VP8TransformAC3 = TransformAC3;
  VP8TransformDC = TransformDC;
@ -1288,5 +1608,32 @@ void VP8DspInitNEON(void) {
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif   // WEBP_USE_NEON
+
+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[6] = LD4;
+
+  VP8PredLuma16[0] = DC16TopLeft;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
 }
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8DspInitNEON)
+
+#endif  // WEBP_USE_NEON
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -52,19 +52,19 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
  // vectors will just contain random value we'll never use nor store.
  __m128i in0, in1, in2, in3;
  {
-    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
-    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
-    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
-    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
+    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
+    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
+    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
-      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
-      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
-      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
-      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
@ -207,10 +207,10 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
    } else {
      // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
-      dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
-      dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
-      dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+      dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
+      dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
+      dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
+      dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
@ -236,10 +236,10 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
    } else {
      // Store four bytes/pixels per line.
-      *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
-      *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
-      *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
-      *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+      WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+      WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+      WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+      WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
    }
  }
 }
@ -262,10 +262,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
-  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
-  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
-  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
-  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+  __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
+  __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
+  __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
+  __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
@ -282,10 +282,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
-  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
-  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
-  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
-  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
 }
 #undef MUL
 #endif   // USE_TRANSFORM_AC3
@ -301,11 +301,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 // Shift each byte of "x" by 3 bits while preserving by the sign bit.
 static WEBP_INLINE void SignedShift8b(__m128i* const x) {
  const __m128i zero = _mm_setzero_si128();
-  const __m128i signs = _mm_cmpgt_epi8(zero, *x);
-  const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs);  // s8 -> s16 sign extend
-  const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
-  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
-  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
+  const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
+  const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
+  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3 + 8);
+  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3 + 8);
  *x = _mm_packs_epi16(lo_1, hi_1);
 }

@ -330,11 +329,10 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
  const __m128i t_2 = MM_ABS(*q1, *q0);

  const __m128i h = _mm_set1_epi8(hev_thresh);
-  const __m128i t_3 = _mm_subs_epu8(t_1, h);  // abs(p1 - p0) - hev_tresh
-  const __m128i t_4 = _mm_subs_epu8(t_2, h);  // abs(q1 - q0) - hev_tresh
+  const __m128i t_max = _mm_max_epu8(t_1, t_2);

-  *not_hev = _mm_or_si128(t_3, t_4);
-  *not_hev = _mm_cmpeq_epi8(*not_hev, zero);  // not_hev <= t1 && not_hev <= t2
+  const __m128i t_max_h = _mm_subs_epu8(t_max, h);
+  *not_hev = _mm_cmpeq_epi8(t_max_h, zero);  // not_hev <= t1 && not_hev <= t2
 }

 // input pixels are int8_t
@ -428,9 +426,11 @@ static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
 static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  const __m128i* const mask, int hev_thresh) {
-  const __m128i sign_bit = _mm_set1_epi8(0x80);
-  const __m128i k64 = _mm_set1_epi8(0x40);
  const __m128i zero = _mm_setzero_si128();
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  const __m128i k64 = _mm_set1_epi8(64);
+  const __m128i k3 = _mm_set1_epi8(3);
+  const __m128i k4 = _mm_set1_epi8(4);
  __m128i not_hev;
  __m128i t1, t2, t3;

@ -448,10 +448,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

-  t2 = _mm_set1_epi8(3);
-  t3 = _mm_set1_epi8(4);
-  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 3
-  t3 = _mm_adds_epi8(t1, t3);        // 3 * (q0 - p0) + (p1 - q1) + 4
+  t2 = _mm_adds_epi8(t1, k3);        // 3 * (q0 - p0) + hev(p1 - q1) + 3
+  t3 = _mm_adds_epi8(t1, k4);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
  SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
  SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
@ -520,47 +518,31 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
 }

 // reads 8 rows across a vertical edge.
-//
-// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
-// two Load4x4() to avoid code duplication.
 static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
                                __m128i* const p, __m128i* const q) {
-  __m128i t1, t2;
+  // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
+  // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
+  const __m128i A0 = _mm_set_epi32(
+      WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]),
+      WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride]));
+  const __m128i A1 = _mm_set_epi32(
+      WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]),
+      WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride]));

-  // Load 0th, 1st, 4th and 5th rows
-  __m128i r0 =  _mm_cvtsi32_si128(*((int*)&b[0 * stride]));  // 03 02 01 00
-  __m128i r1 =  _mm_cvtsi32_si128(*((int*)&b[1 * stride]));  // 13 12 11 10
-  __m128i r4 =  _mm_cvtsi32_si128(*((int*)&b[4 * stride]));  // 43 42 41 40
-  __m128i r5 =  _mm_cvtsi32_si128(*((int*)&b[5 * stride]));  // 53 52 51 50
+  // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+  // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+  const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
+  const __m128i B1 = _mm_unpackhi_epi8(A0, A1);

-  r0 = _mm_unpacklo_epi32(r0, r4);               // 43 42 41 40 03 02 01 00
-  r1 = _mm_unpacklo_epi32(r1, r5);               // 53 52 51 50 13 12 11 10
-
-  // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
-  t1 = _mm_unpacklo_epi8(r0, r1);
-
-  // Load 2nd, 3rd, 6th and 7th rows
-  r0 =  _mm_cvtsi32_si128(*((int*)&b[2 * stride]));          // 23 22 21 22
-  r1 =  _mm_cvtsi32_si128(*((int*)&b[3 * stride]));          // 33 32 31 30
-  r4 =  _mm_cvtsi32_si128(*((int*)&b[6 * stride]));          // 63 62 61 60
-  r5 =  _mm_cvtsi32_si128(*((int*)&b[7 * stride]));          // 73 72 71 70
-
-  r0 = _mm_unpacklo_epi32(r0, r4);               // 63 62 61 60 23 22 21 20
-  r1 = _mm_unpacklo_epi32(r1, r5);               // 73 72 71 70 33 32 31 30
-
-  // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-  t2 = _mm_unpacklo_epi8(r0, r1);
-
-  // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-  // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-  r0 = t1;
-  t1 = _mm_unpacklo_epi16(t1, t2);
-  t2 = _mm_unpackhi_epi16(r0, t2);
+  // C0 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+  // C1 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+  const __m128i C0 = _mm_unpacklo_epi16(B0, B1);
+  const __m128i C1 = _mm_unpackhi_epi16(B0, B1);

  // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-  *p = _mm_unpacklo_epi32(t1, t2);
-  *q = _mm_unpackhi_epi32(t1, t2);
+  *p = _mm_unpacklo_epi32(C0, C1);
+  *q = _mm_unpackhi_epi32(C0, C1);
 }

 static WEBP_INLINE void Load16x4(const uint8_t* const r0,
@ -568,7 +550,6 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
                                 int stride,
                                 __m128i* const p1, __m128i* const p0,
                                 __m128i* const q0, __m128i* const q1) {
-  __m128i t1, t2;
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
  //                10 11 | 12 13
@ -587,22 +568,24 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);

-  t1 = *p1;
-  t2 = *q0;
-  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-  *p1 = _mm_unpacklo_epi64(t1, *p0);
-  *p0 = _mm_unpackhi_epi64(t1, *p0);
-  *q0 = _mm_unpacklo_epi64(t2, *q1);
-  *q1 = _mm_unpackhi_epi64(t2, *q1);
+  {
+    // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+    // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+    // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+    // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+    const __m128i t1 = *p1;
+    const __m128i t2 = *q0;
+    *p1 = _mm_unpacklo_epi64(t1, *p0);
+    *p0 = _mm_unpackhi_epi64(t1, *p0);
+    *q0 = _mm_unpacklo_epi64(t2, *q1);
+    *q1 = _mm_unpackhi_epi64(t2, *q1);
+  }
 }

 static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
-    *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
+    WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
    *x = _mm_srli_si128(*x, 4);
  }
 }
@ -947,15 +930,308 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
  Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
 }

-#endif   // WEBP_USE_SSE2
+//------------------------------------------------------------------------------
+// 4x4 predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+
+// We use the following 8b-arithmetic tricks:
+//     (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
+//   where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
+// and:
+//     (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
+//   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
+//   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
+
+static void VE4(uint8_t* dst) {    // vertical
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
+  const __m128i b = _mm_subs_epu8(a, lsb);
+  const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
+  const uint32_t vals = _mm_cvtsi128_si32(avg);
+  int i;
+  for (i = 0; i < 4; ++i) {
+    WebPUint32ToMem(dst + i * BPS, vals);
+  }
+}
+
+static void LD4(uint8_t* dst) {   // Down-Left
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, dst[-BPS + 7], 3);
+  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
+  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+  const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static void VR4(uint8_t* dst) {   // Vertical-Right
+  const __m128i one = _mm_set1_epi8(1);
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int X = dst[-1 - BPS];
+  const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+  const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
+  const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
+  const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
+  const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
+  const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
+  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+  const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+
+  // these two are hard to implement in SSE2, so we keep the C-version:
+  DST(0, 2) = AVG3(J, I, X);
+  DST(0, 3) = AVG3(K, J, I);
+}
+
+static void VL4(uint8_t* dst) {   // Vertical-Left
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
+  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
+  const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
+  const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
+  const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
+  const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
+  const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
+  const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
+  const __m128i abbc = _mm_or_si128(ab, bc);
+  const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
+  const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
+  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+
+  // these two are hard to get and irregular
+  DST(3, 2) = (extra_out >> 0) & 0xff;
+  DST(3, 3) = (extra_out >> 8) & 0xff;
+}
+
+static void RD4(uint8_t* dst) {   // Down-right
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+  const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
+  const uint32_t I = dst[-1 + 0 * BPS];
+  const uint32_t J = dst[-1 + 1 * BPS];
+  const uint32_t K = dst[-1 + 2 * BPS];
+  const uint32_t L = dst[-1 + 3 * BPS];
+  const __m128i LKJI_____ =
+      _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24));
+  const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD);
+  const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
+  const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
+  const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
+  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+  const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+#undef DST
+#undef AVG3
+
+//------------------------------------------------------------------------------
+// Luma 16x16
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+  const uint8_t* top = dst - BPS;
+  const __m128i zero = _mm_setzero_si128();
+  int y;
+  if (size == 4) {
+    const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+    const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+    for (y = 0; y < 4; ++y, dst += BPS) {
+      const int val = dst[-1] - top[-1];
+      const __m128i base = _mm_set1_epi16(val);
+      const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+      WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+    }
+  } else if (size == 8) {
+    const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+    const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+    for (y = 0; y < 8; ++y, dst += BPS) {
+      const int val = dst[-1] - top[-1];
+      const __m128i base = _mm_set1_epi16(val);
+      const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+      _mm_storel_epi64((__m128i*)dst, out);
+    }
+  } else {
+    const __m128i top_values = _mm_loadu_si128((const __m128i*)top);
+    const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
+    const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
+    for (y = 0; y < 16; ++y, dst += BPS) {
+      const int val = dst[-1] - top[-1];
+      const __m128i base = _mm_set1_epi16(val);
+      const __m128i out_0 = _mm_add_epi16(base, top_base_0);
+      const __m128i out_1 = _mm_add_epi16(base, top_base_1);
+      const __m128i out = _mm_packus_epi16(out_0, out_1);
+      _mm_storeu_si128((__m128i*)dst, out);
+    }
+  }
+}
+
+static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
+
+static void VE16(uint8_t* dst) {
+  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+  int j;
+  for (j = 0; j < 16; ++j) {
+    _mm_storeu_si128((__m128i*)(dst + j * BPS), top);
+  }
+}
+
+static void HE16(uint8_t* dst) {     // horizontal
+  int j;
+  for (j = 16; j > 0; --j) {
+    const __m128i values = _mm_set1_epi8(dst[-1]);
+    _mm_storeu_si128((__m128i*)dst, values);
+    dst += BPS;
+  }
+}
+
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+  int j;
+  const __m128i values = _mm_set1_epi8(v);
+  for (j = 0; j < 16; ++j) {
+    _mm_storeu_si128((__m128i*)(dst + j * BPS), values);
+  }
+}
+
+static void DC16(uint8_t* dst) {    // DC
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+  const __m128i sad8x2 = _mm_sad_epu8(top, zero);
+  // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+  const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+  int left = 0;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    left += dst[-1 + j * BPS];
+  }
+  {
+    const int DC = _mm_cvtsi128_si32(sum) + left + 16;
+    Put16(DC >> 5, dst);
+  }
+}
+
+static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+  int DC = 8;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+  const __m128i sad8x2 = _mm_sad_epu8(top, zero);
+  // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+  const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+  const int DC = _mm_cvtsi128_si32(sum) + 8;
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
+  Put16(0x80, dst);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t* dst) {    // vertical
+  int j;
+  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  for (j = 0; j < 8; ++j) {
+    _mm_storel_epi64((__m128i*)(dst + j * BPS), top);
+  }
+}
+
+static void HE8uv(uint8_t* dst) {    // horizontal
+  int j;
+  for (j = 0; j < 8; ++j) {
+    const __m128i values = _mm_set1_epi8(dst[-1]);
+    _mm_storel_epi64((__m128i*)dst, values);
+    dst += BPS;
+  }
+}
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+  int j;
+  const __m128i values = _mm_set1_epi8(v);
+  for (j = 0; j < 8; ++j) {
+    _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
+  }
+}
+
+static void DC8uv(uint8_t* dst) {     // DC
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  const __m128i sum = _mm_sad_epu8(top, zero);
+  int left = 0;
+  int j;
+  for (j = 0; j < 8; ++j) {
+    left += dst[-1 + j * BPS];
+  }
+  {
+    const int DC = _mm_cvtsi128_si32(sum) + left + 8;
+    Put8x8uv(DC >> 4, dst);
+  }
+}
+
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  const __m128i sum = _mm_sad_epu8(top, zero);
+  const int DC = _mm_cvtsi128_si32(sum) + 4;
+  Put8x8uv(DC >> 3, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+  int dc0 = 4;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[-1 + i * BPS];
+  }
+  Put8x8uv(dc0 >> 3, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
+  Put8x8uv(0x80, dst);
+}

 //------------------------------------------------------------------------------
 // Entry point

 extern void VP8DspInitSSE2(void);

-void VP8DspInitSSE2(void) {
-#if defined(WEBP_USE_SSE2)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
  VP8Transform = Transform;
 #if defined(USE_TRANSFORM_AC3)
  VP8TransformAC3 = TransformAC3;
@ -974,5 +1250,33 @@ void VP8DspInitSSE2(void) {
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif   // WEBP_USE_SSE2
+
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[5] = VR4;
+  VP8PredLuma4[6] = LD4;
+  VP8PredLuma4[7] = VL4;
+
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
 }
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8DspInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/dec_sse41.c
+++ b/src/dsp/dec_sse41.c
@ -0,0 +1,45 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 version of some decoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include <smmintrin.h>
+#include "../dec/vp8i.h"
+
+static void HE16(uint8_t* dst) {     // horizontal
+  int j;
+  const __m128i kShuffle3 = _mm_set1_epi8(3);
+  for (j = 16; j > 0; --j) {
+    const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4));
+    const __m128i values = _mm_shuffle_epi8(in, kShuffle3);
+    _mm_storeu_si128((__m128i*)dst, values);
+    dst += BPS;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
+  VP8PredLuma16[3] = HE16;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8DspInitSSE41)
+
+#endif  // WEBP_USE_SSE41
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -14,16 +14,15 @@
 #ifndef WEBP_DSP_DSP_H_
 #define WEBP_DSP_DSP_H_

-#ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
-#endif
-
 #include "../webp/types.h"
+#include "../utils/utils.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

+#define BPS 32   // this is the common stride for enc/dec
+
 //------------------------------------------------------------------------------
 // CPU detection

@ -36,20 +35,20 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif

-#ifdef __clang__
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif  // __clang__
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif

 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
 #endif

+#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
+#endif
+
 // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
 // files without intrinsics, allowing the corresponding Init() to be called.
 // Files containing intrinsics will need to be built targeting the instruction
@ -58,6 +57,10 @@ extern "C" {
 #define WEBP_USE_SSE2
 #endif

+#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
+#define WEBP_USE_SSE41
+#endif
+
 #if defined(__AVX2__) || defined(WEBP_HAVE_AVX2)
 #define WEBP_USE_AVX2
 #endif
@ -73,24 +76,53 @@ extern "C" {
 #define WEBP_USE_NEON
 #endif

-#if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6)
+#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
+#define WEBP_USE_NEON
+#define WEBP_USE_INTRINSICS
+#endif
+
+#if defined(__mips__) && !defined(__mips64) && \
+    defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
 #define WEBP_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WEBP_USE_MIPS32_R2
+#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
+#define WEBP_USE_MIPS_DSP_R2
+#endif
+#endif
+#endif
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#define WEBP_TSAN_IGNORE_FUNCTION
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#undef WEBP_TSAN_IGNORE_FUNCTION
+#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
 #endif
 #endif

 typedef enum {
  kSSE2,
  kSSE3,
+  kSSE4_1,
  kAVX,
  kAVX2,
  kNEON,
-  kMIPS32
+  kMIPS32,
+  kMIPSdspR2
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
-extern VP8CPUInfo VP8GetCPUInfo;
+WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
+
+//------------------------------------------------------------------------------
+// Init stub generator
+
+// Defines an init function stub to ensure each module exposes a symbol,
+// avoiding a compiler warning.
+#define WEBP_DSP_INIT_STUB(func) \
+  extern void func(void); \
+  WEBP_TSAN_IGNORE_FUNCTION void func(void) {}

 //------------------------------------------------------------------------------
 // Encoding
@ -104,6 +136,7 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
 typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
 extern VP8Idct VP8ITransform;
 extern VP8Fdct VP8FTransform;
+extern VP8Fdct VP8FTransform2;   // performs two transforms at a time
 extern VP8WHT VP8FTransformWHT;
 // Predictions
 // *dst is the destination block. *top and *left can be NULL.
@ -122,26 +155,63 @@ extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;

 typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
 extern VP8BlockCopy VP8Copy4x4;
+extern VP8BlockCopy VP8Copy16x8;
 // Quantization
 struct VP8Matrix;   // forward declaration
 typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
                                const struct VP8Matrix* const mtx);
+// Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
+typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
+                                  const struct VP8Matrix* const mtx);
+
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
+extern VP8Quantize2Blocks VP8EncQuantize2Blocks;

 // specific to 2nd transform:
 typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
                                   const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;

-// Collect histogram for susceptibility calculation and accumulate in histo[].
-struct VP8Histogram;
+extern const int VP8DspScan[16 + 4 + 4];
+
+// Collect histogram for susceptibility calculation.
+#define MAX_COEFF_THRESH   31   // size of histogram used by CollectHistogram.
+typedef struct {
+  // We only need to store max_value and last_non_zero, not the distribution.
+  int max_value;
+  int last_non_zero;
+} VP8Histogram;
 typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
                          int start_block, int end_block,
-                          struct VP8Histogram* const histo);
-extern const int VP8DspScan[16 + 4 + 4];
+                          VP8Histogram* const histo);
 extern VP8CHisto VP8CollectHistogram;
+// General-purpose util function to help VP8CollectHistogram().
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+                         VP8Histogram* const histo);

-void VP8EncDspInit(void);   // must be called before using any of the above
+// must be called before using any of the above
+void VP8EncDspInit(void);
+
+//------------------------------------------------------------------------------
+// cost functions (encoding)
+
+extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)
+// approximate cost per level:
+extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
+extern const uint8_t VP8EncBands[16 + 1];
+
+struct VP8Residual;
+typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
+                                         struct VP8Residual* const res);
+extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+// Cost calculation function.
+typedef int (*VP8GetResidualCostFunc)(int ctx0,
+                                      const struct VP8Residual* const res);
+extern VP8GetResidualCostFunc VP8GetResidualCost;
+
+// must be called before anything using the above
+void VP8EncDspCostInit(void);

 //------------------------------------------------------------------------------
 // Decoding
@ -159,16 +229,17 @@ extern VP8WHT VP8TransformWHT;
 // *dst is the destination block, with stride BPS. Boundary samples are
 // assumed accessible when needed.
 typedef void (*VP8PredFunc)(uint8_t* dst);
-extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
-extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
-extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];

 // clipping tables (for filtering)
 extern const int8_t* const VP8ksclip1;  // clips [-1020, 1020] to [-128, 127]
 extern const int8_t* const VP8ksclip2;  // clips [-112, 112] to [-16, 15]
 extern const uint8_t* const VP8kclip1;  // clips [-255,511] to [0,255]
 extern const uint8_t* const VP8kabs0;   // abs(x) for x in [-255,255]
-void VP8InitClipTables(void);           // must be called first
+// must be called first
+void VP8InitClipTables(void);

 // simple filter (only for luma)
 typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
@ -240,13 +311,81 @@ typedef void (*WebPYUV444Converter)(const uint8_t* y,
                                    const uint8_t* u, const uint8_t* v,
                                    uint8_t* dst, int len);

-extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];

 // Must be called before using the WebPUpsamplers[] (and for premultiplied
 // colorspaces like rgbA, rgbA4444, etc)
 void WebPInitUpsamplers(void);
 // Must be called before using WebPSamplers[]
 void WebPInitSamplers(void);
+// Must be called before using WebPYUV444Converters[]
+void WebPInitYUV444Converters(void);
+
+//------------------------------------------------------------------------------
+// ARGB -> YUV converters
+
+// Convert ARGB samples to luma Y.
+extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+// Convert ARGB samples to U/V with downsampling. do_store should be '1' for
+// even lines and '0' for odd ones. 'src_width' is the original width, not
+// the U/V one.
+extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+                                   int src_width, int do_store);
+
+// Convert a row of accumulated (four-values) of rgba32 toward U/V
+extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
+                                     uint8_t* u, uint8_t* v, int width);
+
+// Convert RGB or BGR to Y
+extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
+extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+
+// used for plain-C fallback.
+extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+                                  int src_width, int do_store);
+extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
+                                    uint8_t* u, uint8_t* v, int width);
+
+// Must be called before using the above.
+void WebPInitConvertARGBToYUV(void);
+
+//------------------------------------------------------------------------------
+// Rescaler
+
+struct WebPRescaler;
+
+// Import a row of data and save its contribution in the rescaler.
+// 'channel' denotes the channel number to be imported. 'Expand' corresponds to
+// the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
+typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
+                                          const uint8_t* src);
+
+extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
+extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
+
+// Export one row (starting at x_out position) from rescaler.
+// 'Expand' corresponds to the wrk->y_expand case.
+// Otherwise 'Shrink' is to be used
+typedef void (*WebPRescalerExportRowFunc)(struct WebPRescaler* const wrk);
+extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
+extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
+
+// Plain-C implementation, as fall-back.
+extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
+                                         const uint8_t* src);
+extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
+                                         const uint8_t* src);
+extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
+extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
+
+// Main entry calls:
+extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
+                                  const uint8_t* src);
+// Export one row (starting at x_out position) from rescaler.
+extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
+
+// Must be called first before using the above.
+void WebPRescalerDspInit(void);

 //------------------------------------------------------------------------------
 // Utilities for processing transparent channel.
@ -260,6 +399,18 @@ extern void (*WebPApplyAlphaMultiply)(
 extern void (*WebPApplyAlphaMultiply4444)(
    uint8_t* rgba4444, int w, int h, int stride);

+// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
+// Returns true if alpha[] plane has non-trivial values different from 0xff.
+extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
+                                int width, int height,
+                                uint8_t* dst, int dst_stride);
+
+// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
+// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
+                                        int width, int height,
+                                        uint32_t* dst, int dst_stride);
+
 // Extract the alpha values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlpha).
 // Returns true if there's only trivial 0xff alpha values.
@ -286,9 +437,59 @@ void WebPMultRows(uint8_t* ptr, int stride,
                  const uint8_t* alpha, int alpha_stride,
                  int width, int num_rows, int inverse);

+// Plain-C versions, used as fallback by some implementations.
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+                  int width, int inverse);
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
+
 // To be called first before using the above.
 void WebPInitAlphaProcessing(void);

+// ARGB packing function: a/r/g/b input is rgba or bgra order.
+extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
+                           const uint8_t* g, const uint8_t* b, int len,
+                           uint32_t* out);
+
+// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
+extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                          int len, int step, uint32_t* out);
+
+// To be called first before using the above.
+void VP8EncDspARGBInit(void);
+
+//------------------------------------------------------------------------------
+// Filter functions
+
+typedef enum {     // Filter types.
+  WEBP_FILTER_NONE = 0,
+  WEBP_FILTER_HORIZONTAL,
+  WEBP_FILTER_VERTICAL,
+  WEBP_FILTER_GRADIENT,
+  WEBP_FILTER_LAST = WEBP_FILTER_GRADIENT + 1,  // end marker
+  WEBP_FILTER_BEST,    // meta-types
+  WEBP_FILTER_FAST
+} WEBP_FILTER_TYPE;
+
+typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
+                               int stride, uint8_t* out);
+typedef void (*WebPUnfilterFunc)(int width, int height, int stride,
+                                 int row, int num_rows, uint8_t* data);
+
+// Filter the given data using the given predictor.
+// 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
+// in raster order.
+// 'stride' is number of bytes per scan line (with possible padding).
+// 'out' should be pre-allocated.
+extern WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
+
+// In-place reconstruct the original data from the given filtered data.
+// The reconstruction will be done for 'num_rows' rows starting from 'row'
+// (assuming rows upto 'row - 1' are already reconstructed).
+extern WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+
+// To be called first before using the above.
+void VP8FiltersInit(void);
+
 #ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -40,10 +40,27 @@ const int VP8DspScan[16 + 4 + 4] = {
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };

+// general-purpose util function
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+                         VP8Histogram* const histo) {
+  int max_value = 0, last_non_zero = 1;
+  int k;
+  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
+    const int value = distribution[k];
+    if (value > 0) {
+      if (value > max_value) max_value = value;
+      last_non_zero = k;
+    }
+  }
+  histo->max_value = max_value;
+  histo->last_non_zero = last_non_zero;
+}
+
 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int k;
    int16_t out[16];
@ -54,9 +71,10 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
    for (k = 0; k < 16; ++k) {
      const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
-      histo->distribution[clipped_value]++;
+      ++distribution[clipped_value];
    }
  }
+  VP8SetHistogramData(distribution, histo);
 }

 //------------------------------------------------------------------------------
@ -68,7 +86,7 @@ static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;

-static void InitTables(void) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255 + 255; ++i) {
@ -159,6 +177,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  }
 }

+static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+  VP8FTransform(src, ref, out);
+  VP8FTransform(src + 4, ref + 4, out + 16);
+}
+
 static void FTransformWHT(const int16_t* in, int16_t* out) {
  // input is 12b signed
  int32_t tmp[16];
@ -195,8 +218,6 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
 //------------------------------------------------------------------------------
 // Intra predictions

-#define DST(x, y) dst[(x) + (y) * BPS]
-
 static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  int j;
  for (j = 0; j < size; ++j) {
@ -207,7 +228,7 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
 static WEBP_INLINE void VerticalPred(uint8_t* dst,
                                     const uint8_t* top, int size) {
  int j;
-  if (top) {
+  if (top != NULL) {
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
  } else {
    Fill(dst, 127, size);
@ -216,7 +237,7 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,

 static WEBP_INLINE void HorizontalPred(uint8_t* dst,
                                       const uint8_t* left, int size) {
-  if (left) {
+  if (left != NULL) {
    int j;
    for (j = 0; j < size; ++j) {
      memset(dst + j * BPS, left[j], size);
@ -229,8 +250,8 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
 static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
                                   const uint8_t* top, int size) {
  int y;
-  if (left) {
-    if (top) {
+  if (left != NULL) {
+    if (top != NULL) {
      const uint8_t* const clip = clip1 + 255 - left[-1];
      for (y = 0; y < size; ++y) {
        const uint8_t* const clip_table = clip + left[y];
@ -248,7 +269,7 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
    // is equivalent to VE prediction where you just copy the top samples.
    // Note that if top samples are not available, the default value is
    // then 129, and not 127 as in the VerticalPred case.
-    if (top) {
+    if (top != NULL) {
      VerticalPred(dst, top, size);
    } else {
      Fill(dst, 129, size);
@ -261,15 +282,15 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
                               int size, int round, int shift) {
  int DC = 0;
  int j;
-  if (top) {
+  if (top != NULL) {
    for (j = 0; j < size; ++j) DC += top[j];
-    if (left) {   // top and left present
+    if (left != NULL) {   // top and left present
      for (j = 0; j < size; ++j) DC += left[j];
    } else {      // top, but no left
      DC += DC;
    }
    DC = (DC + round) >> shift;
-  } else if (left) {   // left but no top
+  } else if (left != NULL) {   // left but no top
    for (j = 0; j < size; ++j) DC += left[j];
    DC += DC;
    DC = (DC + round) >> shift;
@ -291,8 +312,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
  TrueMotion(C8TM8 + dst, left, top, 8);
  // V block
  dst += 8;
-  if (top) top += 8;
-  if (left) left += 16;
+  if (top != NULL) top += 8;
+  if (left != NULL) left += 16;
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
  HorizontalPred(C8HE8 + dst, left, 8);
@ -313,6 +334,7 @@ static void Intra16Preds(uint8_t* dst,
 //------------------------------------------------------------------------------
 // luma 4x4 prediction

+#define DST(x, y) dst[(x) + (y) * BPS]
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

@ -335,10 +357,10 @@ static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
-  *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
-  *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
-  *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
-  *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
+  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }

 static void DC4(uint8_t* dst, const uint8_t* top) {
@ -625,6 +647,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return (last >= 0);
 }

+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  int nz;
+  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  return nz;
+}
+
 static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
                            const VP8Matrix* const mtx) {
  int n, last = -1;
@ -654,16 +684,22 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
 //------------------------------------------------------------------------------
 // Block copy

-static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  int y;
-  for (y = 0; y < size; ++y) {
-    memcpy(dst, src, size);
+  for (y = 0; y < h; ++y) {
+    memcpy(dst, src, w);
    src += BPS;
    dst += BPS;
  }
 }

-static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
+static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 4, 4);
+}
+
+static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 16, 8);
+}

 //------------------------------------------------------------------------------
 // Initialization
@ -673,6 +709,7 @@ static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
 VP8CHisto VP8CollectHistogram;
 VP8Idct VP8ITransform;
 VP8Fdct VP8FTransform;
+VP8Fdct VP8FTransform2;
 VP8WHT VP8FTransformWHT;
 VP8Intra4Preds VP8EncPredLuma4;
 VP8IntraPreds VP8EncPredLuma16;
@ -684,18 +721,22 @@ VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8Quantize2Blocks VP8EncQuantize2Blocks;
 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy16x8;

 extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitSSE41(void);
 extern void VP8EncDspInitAVX2(void);
 extern void VP8EncDspInitNEON(void);
 extern void VP8EncDspInitMIPS32(void);
+extern void VP8EncDspInitMIPSdspR2(void);

 static volatile VP8CPUInfo enc_last_cpuinfo_used =
    (VP8CPUInfo)&enc_last_cpuinfo_used;

-void VP8EncDspInit(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;

  VP8DspInit();  // common inverse transforms
@ -705,6 +746,7 @@ void VP8EncDspInit(void) {
  VP8CollectHistogram = CollectHistogram;
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
+  VP8FTransform2 = FTransform2;
  VP8FTransformWHT = FTransformWHT;
  VP8EncPredLuma4 = Intra4Preds;
  VP8EncPredLuma16 = Intra16Preds;
@ -716,14 +758,21 @@ void VP8EncDspInit(void) {
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
  VP8Copy4x4 = Copy4x4;
+  VP8Copy16x8 = Copy16x8;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8EncDspInitSSE41();
+      }
+#endif
    }
 #endif
 #if defined(WEBP_USE_AVX2)
@ -740,8 +789,12 @@ void VP8EncDspInit(void) {
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8EncDspInitMIPSdspR2();
+    }
 #endif
  }
  enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
-
--- a/src/dsp/enc_avx2.c
+++ b/src/dsp/enc_avx2.c
@ -18,7 +18,4 @@
 //------------------------------------------------------------------------------
 // Entry point

-extern void VP8EncDspInitAVX2(void);
-
-void VP8EncDspInitAVX2(void) {
-}
+WEBP_DSP_INIT_STUB(VP8EncDspInitAVX2)
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@ -17,13 +17,10 @@

 #if defined(WEBP_USE_MIPS32)

+#include "./mips_macro.h"
 #include "../enc/vp8enci.h"
 #include "../enc/cost.h"

-#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
-#define WORK_AROUND_GCC
-#endif
-
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;

@ -34,86 +31,86 @@ static const int kC2 = 35468;
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 // TEMP4..TEMP5 - temporary registers
 #define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
-  "lh      %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lh      %[temp18],      "#B"(%[temp20])                 \n\t"            \
-  "lh      %[temp17],      "#C"(%[temp20])                 \n\t"            \
-  "lh      %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP4"],    %[temp16],      %[temp18]       \n\t"            \
-  "subu    %[temp16],      %[temp16],      %[temp18]       \n\t"            \
-  "mul     %["#TEMP0"],    %[temp17],      %[kC2]          \n\t"            \
-  "mul     %[temp18],      %[temp19],      %[kC1]          \n\t"            \
-  "mul     %[temp17],      %[temp17],      %[kC1]          \n\t"            \
-  "mul     %[temp19],      %[temp19],      %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\n"            \
-  "sra     %[temp18],      %[temp18],      16              \n\n"            \
-  "sra     %[temp17],      %[temp17],      16              \n\n"            \
-  "sra     %[temp19],      %[temp19],      16              \n\n"            \
-  "subu    %["#TEMP2"],    %["#TEMP0"],    %[temp18]       \n\t"            \
-  "addu    %["#TEMP3"],    %[temp17],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP0"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"            \
-  "addu    %["#TEMP1"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP2"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP3"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"
+  "lh      %[temp16],      " #A "(%[temp20])                 \n\t"          \
+  "lh      %[temp18],      " #B "(%[temp20])                 \n\t"          \
+  "lh      %[temp17],      " #C "(%[temp20])                 \n\t"          \
+  "lh      %[temp19],      " #D "(%[temp20])                 \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp16],      %[temp18]       \n\t"          \
+  "subu    %[temp16],      %[temp16],      %[temp18]         \n\t"          \
+  "mul     %[" #TEMP0 "],    %[temp17],      %[kC2]          \n\t"          \
+  "mul     %[temp18],      %[temp19],      %[kC1]            \n\t"          \
+  "mul     %[temp17],      %[temp17],      %[kC1]            \n\t"          \
+  "mul     %[temp19],      %[temp19],      %[kC2]            \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\n"          \
+  "sra     %[temp18],      %[temp18],      16                \n\n"          \
+  "sra     %[temp17],      %[temp17],      16                \n\n"          \
+  "sra     %[temp19],      %[temp19],      16                \n\n"          \
+  "subu    %[" #TEMP2 "],    %[" #TEMP0 "],    %[temp18]     \n\t"          \
+  "addu    %[" #TEMP3 "],    %[temp17],      %[temp19]       \n\t"          \
+  "addu    %[" #TEMP0 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"          \
+  "addu    %[" #TEMP1 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP2 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP3 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"

 // macro for one horizontal pass in ITransformOne
 // MUL and STORE macros inlined
 // a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
 // temp0..temp15 holds tmp[0]..tmp[15]
-// A..D - offsets in bytes to load from ref and store to dst buffer
+// A - offset in bytes to load from ref and store to dst buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)            \
-  "addiu   %["#TEMP0"],    %["#TEMP0"],    4               \n\t"            \
-  "addu    %[temp16],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "subu    %[temp17],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "mul     %["#TEMP0"],    %["#TEMP4"],    %[kC2]          \n\t"            \
-  "mul     %["#TEMP8"],    %["#TEMP12"],   %[kC1]          \n\t"            \
-  "mul     %["#TEMP4"],    %["#TEMP4"],    %[kC1]          \n\t"            \
-  "mul     %["#TEMP12"],   %["#TEMP12"],   %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    16              \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    16              \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   16              \n\t"            \
-  "subu    %[temp18],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "addu    %[temp19],      %["#TEMP4"],    %["#TEMP12"]    \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP8"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP12"],   %[temp16],      %[temp19]       \n\t"            \
-  "lw      %[temp20],      0(%[args])                      \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    3               \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    3               \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    3               \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   3               \n\t"            \
-  "lbu     %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lbu     %[temp17],      "#B"(%[temp20])                 \n\t"            \
-  "lbu     %[temp18],      "#C"(%[temp20])                 \n\t"            \
-  "lbu     %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %["#TEMP0"]     \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %["#TEMP4"]     \n\t"            \
-  "addu    %["#TEMP8"],    %[temp18],      %["#TEMP8"]     \n\t"            \
-  "addu    %["#TEMP12"],   %[temp19],      %["#TEMP12"]    \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    $zero           \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    $zero           \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    $zero           \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   $zero           \n\t"            \
-  "movn    %["#TEMP0"],    $zero,          %[temp16]       \n\t"            \
-  "movn    %["#TEMP4"],    $zero,          %[temp17]       \n\t"            \
-  "movn    %["#TEMP8"],    $zero,          %[temp18]       \n\t"            \
-  "movn    %["#TEMP12"],   $zero,          %[temp19]       \n\t"            \
-  "addiu   %[temp20],      $zero,          255             \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    %[temp20]       \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    %[temp20]       \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    %[temp20]       \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   %[temp20]       \n\t"            \
-  "movz    %["#TEMP0"],    %[temp20],      %[temp16]       \n\t"            \
-  "movz    %["#TEMP4"],    %[temp20],      %[temp17]       \n\t"            \
-  "lw      %[temp16],      8(%[args])                      \n\t"            \
-  "movz    %["#TEMP8"],    %[temp20],      %[temp18]       \n\t"            \
-  "movz    %["#TEMP12"],   %[temp20],      %[temp19]       \n\t"            \
-  "sb      %["#TEMP0"],    "#A"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP4"],    "#B"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP8"],    "#C"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP12"],   "#D"(%[temp16])                 \n\t"
+#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12)                       \
+  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4               \n\t"          \
+  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]          \n\t"          \
+  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]          \n\t"          \
+  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]          \n\t"          \
+  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]          \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16              \n\t"          \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16              \n\t"          \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16              \n\t"          \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16              \n\t"          \
+  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]    \n\t"          \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[temp19]         \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[temp18]         \n\t"          \
+  "subu    %[" #TEMP8 "],    %[temp17],      %[temp18]         \n\t"          \
+  "subu    %[" #TEMP12 "],   %[temp16],      %[temp19]         \n\t"          \
+  "lw      %[temp20],      0(%[args])                          \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    3               \n\t"          \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    3               \n\t"          \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    3               \n\t"          \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   3               \n\t"          \
+  "lbu     %[temp16],      0+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp17],      1+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp18],      2+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp19],      3+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[" #TEMP0 "]     \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[" #TEMP4 "]     \n\t"          \
+  "addu    %[" #TEMP8 "],    %[temp18],      %[" #TEMP8 "]     \n\t"          \
+  "addu    %[" #TEMP12 "],   %[temp19],      %[" #TEMP12 "]    \n\t"          \
+  "slt     %[temp16],      %[" #TEMP0 "],    $zero             \n\t"          \
+  "slt     %[temp17],      %[" #TEMP4 "],    $zero             \n\t"          \
+  "slt     %[temp18],      %[" #TEMP8 "],    $zero             \n\t"          \
+  "slt     %[temp19],      %[" #TEMP12 "],   $zero             \n\t"          \
+  "movn    %[" #TEMP0 "],    $zero,          %[temp16]         \n\t"          \
+  "movn    %[" #TEMP4 "],    $zero,          %[temp17]         \n\t"          \
+  "movn    %[" #TEMP8 "],    $zero,          %[temp18]         \n\t"          \
+  "movn    %[" #TEMP12 "],   $zero,          %[temp19]         \n\t"          \
+  "addiu   %[temp20],      $zero,          255                 \n\t"          \
+  "slt     %[temp16],      %[" #TEMP0 "],    %[temp20]         \n\t"          \
+  "slt     %[temp17],      %[" #TEMP4 "],    %[temp20]         \n\t"          \
+  "slt     %[temp18],      %[" #TEMP8 "],    %[temp20]         \n\t"          \
+  "slt     %[temp19],      %[" #TEMP12 "],   %[temp20]         \n\t"          \
+  "movz    %[" #TEMP0 "],    %[temp20],      %[temp16]         \n\t"          \
+  "movz    %[" #TEMP4 "],    %[temp20],      %[temp17]         \n\t"          \
+  "lw      %[temp16],      8(%[args])                          \n\t"          \
+  "movz    %[" #TEMP8 "],    %[temp20],      %[temp18]         \n\t"          \
+  "movz    %[" #TEMP12 "],   %[temp20],      %[temp19]         \n\t"          \
+  "sb      %[" #TEMP0 "],    0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP4 "],    1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP8 "],    2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"

 // Does one or two inverse transforms.
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
@ -130,10 +127,10 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
    VERTICAL_PASS(4, 20, 12, 28, temp12, temp8,  temp9,  temp10, temp11)
    VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)

-    HORIZONTAL_PASS( 0,  1,  2,  3, temp0, temp4, temp8,  temp12)
-    HORIZONTAL_PASS(16, 17, 18, 19, temp1, temp5, temp9,  temp13)
-    HORIZONTAL_PASS(32, 33, 34, 35, temp2, temp6, temp10, temp14)
-    HORIZONTAL_PASS(48, 49, 50, 51, temp3, temp7, temp11, temp15)
+    HORIZONTAL_PASS(0, temp0, temp4, temp8,  temp12)
+    HORIZONTAL_PASS(1, temp1, temp5, temp9,  temp13)
+    HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
+    HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)

    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
@ -164,9 +161,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
 // K - offset in bytes (kZigzag[n] * 4)
 // N - offset in bytes (n * 2)
 #define QUANTIZE_ONE(J, K, N)                                               \
-  "lh           %[temp0],       "#J"(%[ppin])                       \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppsharpen])                  \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppzthresh])                  \n\t"   \
+  "lh           %[temp0],       " #J "(%[ppin])                     \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppsharpen])                \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppzthresh])                \n\t"   \
  "sra          %[sign],        %[temp0],           15              \n\t"   \
  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
@ -175,9 +172,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
  "addiu        %[temp5],       $zero,              0               \n\t"   \
  "addiu        %[level],       $zero,              0               \n\t"   \
  "beqz         %[temp4],       2f                                  \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppiq])                       \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppbias])                     \n\t"   \
-  "lhu          %[temp3],       "#J"(%[ppq])                        \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppiq])                     \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppbias])                   \n\t"   \
+  "lhu          %[temp3],       " #J "(%[ppq])                      \n\t"   \
  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
  "sra          %[level],       %[level],           17              \n\t"   \
@ -187,8 +184,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
  "subu         %[level],       %[level],           %[sign]         \n\t"   \
  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
 "2:                                                                 \n\t"   \
-  "sh           %[temp5],       "#J"(%[ppin])                       \n\t"   \
-  "sh           %[level],       "#N"(%[pout])                       \n\t"
+  "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
+  "sh           %[level],       " #N "(%[pout])                     \n\t"

 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
@ -241,46 +238,54 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return 0;
 }

+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  int nz;
+  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  return nz;
+}
+
 #undef QUANTIZE_ONE

 // macro for one horizontal pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
-// A..D - offsets in bytes to load from a and b buffers
+// A - offset in bytes to load from a and b buffers
 // E..H - offsets in bytes to store first results to tmp buffer
 // E1..H1 - offsets in bytes to store second results to tmp buffer
-#define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1)   \
-  "lbu    %[temp0],  "#A"(%[a])              \n\t"                \
-  "lbu    %[temp1],  "#B"(%[a])              \n\t"                \
-  "lbu    %[temp2],  "#C"(%[a])              \n\t"                \
-  "lbu    %[temp3],  "#D"(%[a])              \n\t"                \
-  "lbu    %[temp4],  "#A"(%[b])              \n\t"                \
-  "lbu    %[temp5],  "#B"(%[b])              \n\t"                \
-  "lbu    %[temp6],  "#C"(%[b])              \n\t"                \
-  "lbu    %[temp7],  "#D"(%[b])              \n\t"                \
-  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
-  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
-  "addu   %[temp2],  %[temp1],    %[temp3]   \n\t"                \
-  "subu   %[temp1],  %[temp1],    %[temp3]   \n\t"                \
-  "addu   %[temp3],  %[temp4],    %[temp6]   \n\t"                \
-  "subu   %[temp4],  %[temp4],    %[temp6]   \n\t"                \
-  "addu   %[temp6],  %[temp5],    %[temp7]   \n\t"                \
-  "subu   %[temp5],  %[temp5],    %[temp7]   \n\t"                \
-  "addu   %[temp7],  %[temp8],    %[temp2]   \n\t"                \
-  "subu   %[temp2],  %[temp8],    %[temp2]   \n\t"                \
-  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
-  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
-  "addu   %[temp1],  %[temp3],    %[temp6]   \n\t"                \
-  "subu   %[temp3],  %[temp3],    %[temp6]   \n\t"                \
-  "addu   %[temp6],  %[temp4],    %[temp5]   \n\t"                \
-  "subu   %[temp4],  %[temp4],    %[temp5]   \n\t"                \
-  "sw     %[temp7],  "#E"(%[tmp])            \n\t"                \
-  "sw     %[temp2],  "#H"(%[tmp])            \n\t"                \
-  "sw     %[temp8],  "#F"(%[tmp])            \n\t"                \
-  "sw     %[temp0],  "#G"(%[tmp])            \n\t"                \
-  "sw     %[temp1],  "#E1"(%[tmp])           \n\t"                \
-  "sw     %[temp3],  "#H1"(%[tmp])           \n\t"                \
-  "sw     %[temp6],  "#F1"(%[tmp])           \n\t"                \
-  "sw     %[temp4],  "#G1"(%[tmp])           \n\t"
+#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1)                  \
+  "lbu    %[temp0],  0+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp1],  1+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp2],  2+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp3],  3+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp4],  0+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp5],  1+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp6],  2+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp7],  3+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp2]         \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]         \n\t"                \
+  "addu   %[temp2],  %[temp1],    %[temp3]         \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp3]         \n\t"                \
+  "addu   %[temp3],  %[temp4],    %[temp6]         \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp6]         \n\t"                \
+  "addu   %[temp6],  %[temp5],    %[temp7]         \n\t"                \
+  "subu   %[temp5],  %[temp5],    %[temp7]         \n\t"                \
+  "addu   %[temp7],  %[temp8],    %[temp2]         \n\t"                \
+  "subu   %[temp2],  %[temp8],    %[temp2]         \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]         \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]         \n\t"                \
+  "addu   %[temp1],  %[temp3],    %[temp6]         \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp6]         \n\t"                \
+  "addu   %[temp6],  %[temp4],    %[temp5]         \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp5]         \n\t"                \
+  "sw     %[temp7],  " #E "(%[tmp])                \n\t"                \
+  "sw     %[temp2],  " #H "(%[tmp])                \n\t"                \
+  "sw     %[temp8],  " #F "(%[tmp])                \n\t"                \
+  "sw     %[temp0],  " #G "(%[tmp])                \n\t"                \
+  "sw     %[temp1],  " #E1 "(%[tmp])               \n\t"                \
+  "sw     %[temp3],  " #H1 "(%[tmp])               \n\t"                \
+  "sw     %[temp6],  " #F1 "(%[tmp])               \n\t"                \
+  "sw     %[temp4],  " #G1 "(%[tmp])               \n\t"

 // macro for one vertical pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
@ -295,10 +300,10 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
 // A1..D1 - offsets in bytes to load second results from tmp buffer
 // E..H - offsets in bytes to load from w buffer
 #define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
-  "lw     %[temp0],  "#A1"(%[tmp])           \n\t"                \
-  "lw     %[temp1],  "#C1"(%[tmp])           \n\t"                \
-  "lw     %[temp2],  "#B1"(%[tmp])           \n\t"                \
-  "lw     %[temp3],  "#D1"(%[tmp])           \n\t"                \
+  "lw     %[temp0],  " #A1 "(%[tmp])         \n\t"                \
+  "lw     %[temp1],  " #C1 "(%[tmp])         \n\t"                \
+  "lw     %[temp2],  " #B1 "(%[tmp])         \n\t"                \
+  "lw     %[temp3],  " #D1 "(%[tmp])         \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@ -319,18 +324,18 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
-  "lhu    %[temp4],  "#E"(%[w])              \n\t"                \
-  "lhu    %[temp5],  "#F"(%[w])              \n\t"                \
-  "lhu    %[temp6],  "#G"(%[w])              \n\t"                \
-  "lhu    %[temp7],  "#H"(%[w])              \n\t"                \
+  "lhu    %[temp4],  " #E "(%[w])            \n\t"                \
+  "lhu    %[temp5],  " #F "(%[w])            \n\t"                \
+  "lhu    %[temp6],  " #G "(%[w])            \n\t"                \
+  "lhu    %[temp7],  " #H "(%[w])            \n\t"                \
  "madd   %[temp4],  %[temp3]                \n\t"                \
  "madd   %[temp5],  %[temp1]                \n\t"                \
  "madd   %[temp6],  %[temp0]                \n\t"                \
  "madd   %[temp7],  %[temp8]                \n\t"                \
-  "lw     %[temp0],  "#A"(%[tmp])            \n\t"                \
-  "lw     %[temp1],  "#C"(%[tmp])            \n\t"                \
-  "lw     %[temp2],  "#B"(%[tmp])            \n\t"                \
-  "lw     %[temp3],  "#D"(%[tmp])            \n\t"                \
+  "lw     %[temp0],  " #A "(%[tmp])          \n\t"                \
+  "lw     %[temp1],  " #C "(%[tmp])          \n\t"                \
+  "lw     %[temp2],  " #B "(%[tmp])          \n\t"                \
+  "lw     %[temp3],  " #D "(%[tmp])          \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@ -362,10 +367,10 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

  __asm__ volatile(
-    HORIZONTAL_PASS( 0,  1,  2,  3,    0,  4,  8, 12,    64,  68,  72,  76)
-    HORIZONTAL_PASS(16, 17, 18, 19,   16, 20, 24, 28,    80,  84,  88,  92)
-    HORIZONTAL_PASS(32, 33, 34, 35,   32, 36, 40, 44,    96, 100, 104, 108)
-    HORIZONTAL_PASS(48, 49, 50, 51,   48, 52, 56, 60,   112, 116, 120, 124)
+    HORIZONTAL_PASS(0,   0,  4,  8, 12,    64,  68,  72,  76)
+    HORIZONTAL_PASS(1,  16, 20, 24, 28,    80,  84,  88,  92)
+    HORIZONTAL_PASS(2,  32, 36, 40, 44,    96, 100, 104, 108)
+    HORIZONTAL_PASS(3,  48, 52, 56, 60,   112, 116, 120, 124)
    "mthi   $zero                             \n\t"
    "mtlo   $zero                             \n\t"
    VERTICAL_PASS( 0, 16, 32, 48,     64, 80,  96, 112,   0,  8, 16, 24)
@ -405,73 +410,73 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,

 // macro for one horizontal pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
-// A..D - offsets in bytes to load from src and ref buffers
+// A - offset in bytes to load from src and ref buffers
 // TEMP0..TEMP3 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
-  "lw     %["#TEMP1"],  0(%[args])                     \n\t"    \
-  "lw     %["#TEMP2"],  4(%[args])                     \n\t"    \
-  "lbu    %[temp16],    "#A"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#A"(%["#TEMP2"])              \n\t"    \
-  "lbu    %[temp18],    "#B"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#B"(%["#TEMP2"])              \n\t"    \
-  "subu   %[temp20],    %[temp16],    %[temp17]        \n\t"    \
-  "lbu    %[temp16],    "#C"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#C"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP0"],  %[temp18],    %[temp19]        \n\t"    \
-  "lbu    %[temp18],    "#D"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#D"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP1"],  %[temp16],    %[temp17]        \n\t"    \
-  "subu   %["#TEMP2"],  %[temp18],    %[temp19]        \n\t"    \
-  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "mul    %[temp16],    %["#TEMP2"],  %[c5352]         \n\t"    \
-  "mul    %[temp17],    %["#TEMP2"],  %[c2217]         \n\t"    \
-  "mul    %[temp18],    %["#TEMP0"],  %[c5352]         \n\t"    \
-  "mul    %[temp19],    %["#TEMP0"],  %[c2217]         \n\t"    \
-  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]        \n\t"    \
-  "subu   %[temp20],    %["#TEMP3"],  %[temp20]        \n\t"    \
-  "sll    %["#TEMP0"],  %["#TEMP1"],  3                \n\t"    \
-  "sll    %["#TEMP2"],  %[temp20],    3                \n\t"    \
-  "addiu  %[temp16],    %[temp16],    1812             \n\t"    \
-  "addiu  %[temp17],    %[temp17],    937              \n\t"    \
-  "addu   %[temp16],    %[temp16],    %[temp19]        \n\t"    \
-  "subu   %[temp17],    %[temp17],    %[temp18]        \n\t"    \
-  "sra    %["#TEMP1"],  %[temp16],    9                \n\t"    \
-  "sra    %["#TEMP3"],  %[temp17],    9                \n\t"
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                  \
+  "lw     %[" #TEMP1 "],  0(%[args])                           \n\t"    \
+  "lw     %[" #TEMP2 "],  4(%[args])                           \n\t"    \
+  "lbu    %[temp16],    0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp17],    0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "lbu    %[temp18],    1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp19],    1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[temp20],    %[temp16],    %[temp17]                \n\t"    \
+  "lbu    %[temp16],    2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp17],    2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[" #TEMP0 "],  %[temp18],    %[temp19]              \n\t"    \
+  "lbu    %[temp18],    3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp19],    3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[" #TEMP1 "],  %[temp16],    %[temp17]              \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp18],    %[temp19]              \n\t"    \
+  "addu   %[" #TEMP3 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
+  "addu   %[temp20],    %[" #TEMP0 "],  %[" #TEMP1 "]          \n\t"    \
+  "subu   %[" #TEMP0 "],  %[" #TEMP0 "],  %[" #TEMP1 "]        \n\t"    \
+  "mul    %[temp16],    %[" #TEMP2 "],  %[c5352]               \n\t"    \
+  "mul    %[temp17],    %[" #TEMP2 "],  %[c2217]               \n\t"    \
+  "mul    %[temp18],    %[" #TEMP0 "],  %[c5352]               \n\t"    \
+  "mul    %[temp19],    %[" #TEMP0 "],  %[c2217]               \n\t"    \
+  "addu   %[" #TEMP1 "],  %[" #TEMP3 "],  %[temp20]            \n\t"    \
+  "subu   %[temp20],    %[" #TEMP3 "],  %[temp20]              \n\t"    \
+  "sll    %[" #TEMP0 "],  %[" #TEMP1 "],  3                    \n\t"    \
+  "sll    %[" #TEMP2 "],  %[temp20],    3                      \n\t"    \
+  "addiu  %[temp16],    %[temp16],    1812                     \n\t"    \
+  "addiu  %[temp17],    %[temp17],    937                      \n\t"    \
+  "addu   %[temp16],    %[temp16],    %[temp19]                \n\t"    \
+  "subu   %[temp17],    %[temp17],    %[temp18]                \n\t"    \
+  "sra    %[" #TEMP1 "],  %[temp16],    9                      \n\t"    \
+  "sra    %[" #TEMP3 "],  %[temp17],    9                      \n\t"

 // macro for one vertical pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to store to out buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)  \
-  "addu   %[temp16],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "subu   %[temp19],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "addu   %[temp17],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "subu   %[temp18],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "mul    %["#TEMP8"],  %[temp19],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP12"], %[temp18],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP4"],  %[temp19],    %[c5352]         \n\t"    \
-  "mul    %[temp18],    %[temp18],    %[c5352]         \n\t"    \
-  "addiu  %[temp16],    %[temp16],    7                \n\t"    \
-  "addu   %["#TEMP0"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP0"],  %["#TEMP0"],  4                \n\t"    \
-  "addu   %["#TEMP12"], %["#TEMP12"], %["#TEMP4"]      \n\t"    \
-  "subu   %["#TEMP4"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP4"],  %["#TEMP4"],  4                \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  30000            \n\t"    \
-  "addiu  %["#TEMP12"], %["#TEMP12"], 12000            \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  21000            \n\t"    \
-  "subu   %["#TEMP8"],  %["#TEMP8"],  %[temp18]        \n\t"    \
-  "sra    %["#TEMP12"], %["#TEMP12"], 16               \n\t"    \
-  "sra    %["#TEMP8"],  %["#TEMP8"],  16               \n\t"    \
-  "addiu  %[temp16],    %["#TEMP12"], 1                \n\t"    \
-  "movn   %["#TEMP12"], %[temp16],    %[temp19]        \n\t"    \
-  "sh     %["#TEMP0"],  "#A"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP4"],  "#C"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP8"],  "#D"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP12"], "#B"(%[temp20])                \n\t"
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)    \
+  "addu   %[temp16],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "subu   %[temp19],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "addu   %[temp17],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "subu   %[temp18],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "mul    %[" #TEMP8 "],  %[temp19],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP12 "], %[temp18],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP4 "],  %[temp19],    %[c5352]         \n\t"    \
+  "mul    %[temp18],    %[temp18],    %[c5352]           \n\t"    \
+  "addiu  %[temp16],    %[temp16],    7                  \n\t"    \
+  "addu   %[" #TEMP0 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP0 "],  %[" #TEMP0 "],  4              \n\t"    \
+  "addu   %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "]  \n\t"    \
+  "subu   %[" #TEMP4 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP4 "],  %[" #TEMP4 "],  4              \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  30000          \n\t"    \
+  "addiu  %[" #TEMP12 "], %[" #TEMP12 "], 12000          \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  21000          \n\t"    \
+  "subu   %[" #TEMP8 "],  %[" #TEMP8 "],  %[temp18]      \n\t"    \
+  "sra    %[" #TEMP12 "], %[" #TEMP12 "], 16             \n\t"    \
+  "sra    %[" #TEMP8 "],  %[" #TEMP8 "],  16             \n\t"    \
+  "addiu  %[temp16],    %[" #TEMP12 "], 1                \n\t"    \
+  "movn   %[" #TEMP12 "], %[temp16],    %[temp19]        \n\t"    \
+  "sh     %[" #TEMP0 "],  " #A "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP4 "],  " #C "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"

 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -483,10 +488,10 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
      { (const int*)src, (const int*)ref, (const int*)out };

  __asm__ volatile(
-    HORIZONTAL_PASS( 0,  1,  2,  3, temp0,  temp1,  temp2,  temp3)
-    HORIZONTAL_PASS(16, 17, 18, 19, temp4,  temp5,  temp6,  temp7)
-    HORIZONTAL_PASS(32, 33, 34, 35, temp8,  temp9,  temp10, temp11)
-    HORIZONTAL_PASS(48, 49, 50, 51, temp12, temp13, temp14, temp15)
+    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
+    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
+    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
+    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
    "lw   %[temp20],    8(%[args])                     \n\t"
    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
@ -508,128 +513,17 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS

-// Forward declaration.
-extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
-
-int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
-  int n = res->first;
-  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
-  int cost;
-  const int const_2 = 2;
-  const int const_255 = 255;
-  const int const_max_level = MAX_VARIABLE_LEVEL;
-  int res_cost;
-  int res_prob;
-  int res_coeffs;
-  int res_last;
-  int v_reg;
-  int b_reg;
-  int ctx_reg;
-  int cost_add, temp_1, temp_2, temp_3;
-
-  if (res->last < 0) {
-    return VP8BitCost(0, p0);
-  }
-
-  cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
-
-  res_cost = (int)res->cost;
-  res_prob = (int)res->prob;
-  res_coeffs = (int)res->coeffs;
-  res_last = (int)res->last;
-
-  __asm__ volatile(
-    ".set   push                                                           \n\t"
-    ".set   noreorder                                                      \n\t"
-
-    "sll    %[temp_1],     %[n],              1                            \n\t"
-    "addu   %[res_coeffs], %[res_coeffs],     %[temp_1]                    \n\t"
-    "slt    %[temp_2],     %[n],              %[res_last]                  \n\t"
-    "bnez   %[temp_2],     1f                                              \n\t"
-    " li    %[cost_add],   0                                               \n\t"
-    "b      2f                                                             \n\t"
-    " nop                                                                  \n\t"
-  "1:                                                                      \n\t"
-    "lh     %[v_reg],      0(%[res_coeffs])                                \n\t"
-    "addu   %[b_reg],      %[n],              %[VP8EncBands]               \n\t"
-    "move   %[temp_1],     %[const_max_level]                              \n\t"
-    "addu   %[cost],       %[cost],           %[cost_add]                  \n\t"
-    "negu   %[temp_2],     %[v_reg]                                        \n\t"
-    "slti   %[temp_3],     %[v_reg],          0                            \n\t"
-    "movn   %[v_reg],      %[temp_2],         %[temp_3]                    \n\t"
-    "lbu    %[b_reg],      1(%[b_reg])                                     \n\t"
-    "li     %[cost_add],   0                                               \n\t"
-
-    "sltiu  %[temp_3],     %[v_reg],          2                            \n\t"
-    "move   %[ctx_reg],    %[v_reg]                                        \n\t"
-    "movz   %[ctx_reg],    %[const_2],        %[temp_3]                    \n\t"
-    //  cost += VP8LevelCost(t, v);
-    "slt    %[temp_3],     %[v_reg],          %[const_max_level]           \n\t"
-    "movn   %[temp_1],     %[v_reg],          %[temp_3]                    \n\t"
-    "sll    %[temp_2],     %[v_reg],          1                            \n\t"
-    "addu   %[temp_2],     %[temp_2],         %[VP8LevelFixedCosts]        \n\t"
-    "lhu    %[temp_2],     0(%[temp_2])                                    \n\t"
-    "sll    %[temp_1],     %[temp_1],         1                            \n\t"
-    "addu   %[temp_1],     %[temp_1],         %[t]                         \n\t"
-    "lhu    %[temp_3],     0(%[temp_1])                                    \n\t"
-    "addu   %[cost],       %[cost],           %[temp_2]                    \n\t"
-
-    //  t = res->cost[b][ctx];
-    "sll    %[temp_1],     %[ctx_reg],        7                            \n\t"
-    "sll    %[temp_2],     %[ctx_reg],        3                            \n\t"
-    "addu   %[cost],       %[cost],           %[temp_3]                    \n\t"
-    "addu   %[temp_1],     %[temp_1],         %[temp_2]                    \n\t"
-    "sll    %[temp_2],     %[b_reg],          3                            \n\t"
-    "sll    %[temp_3],     %[b_reg],          5                            \n\t"
-    "sub    %[temp_2],     %[temp_3],         %[temp_2]                    \n\t"
-    "sll    %[temp_3],     %[temp_2],         4                            \n\t"
-    "addu   %[temp_1],     %[temp_1],         %[temp_3]                    \n\t"
-    "addu   %[temp_2],     %[temp_2],         %[res_cost]                  \n\t"
-    "addiu  %[n],          %[n],              1                            \n\t"
-    "addu   %[t],          %[temp_1],         %[temp_2]                    \n\t"
-    "slt    %[temp_1],     %[n],              %[res_last]                  \n\t"
-    "bnez   %[temp_1],     1b                                              \n\t"
-    " addiu %[res_coeffs], %[res_coeffs],     2                            \n\t"
-   "2:                                                                     \n\t"
-
-    ".set   pop                                                            \n\t"
-    : [cost]"+r"(cost), [t]"+r"(t), [n]"+r"(n), [v_reg]"=&r"(v_reg),
-      [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [cost_add]"=&r"(cost_add),
-      [temp_1]"=&r"(temp_1), [temp_2]"=&r"(temp_2), [temp_3]"=&r"(temp_3)
-    : [const_2]"r"(const_2), [const_255]"r"(const_255), [res_last]"r"(res_last),
-      [VP8EntropyCost]"r"(VP8EntropyCost), [VP8EncBands]"r"(VP8EncBands),
-      [const_max_level]"r"(const_max_level), [res_prob]"r"(res_prob),
-      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_coeffs]"r"(res_coeffs),
-      [res_cost]"r"(res_cost)
-    : "memory"
-  );
-
-  // Last coefficient is always non-zero
-  {
-    const int v = abs(res->coeffs[n]);
-    assert(v != 0);
-    cost += VP8LevelCost(t, v);
-    if (n < 15) {
-      const int b = VP8EncBands[n + 1];
-      const int ctx = (v == 1) ? 1 : 2;
-      const int last_p0 = res->prob[b][ctx][0];
-      cost += VP8BitCost(0, last_p0);
-    }
-  }
-  return cost;
-}
+#if !defined(WORK_AROUND_GCC)

 #define GET_SSE_INNER(A, B, C, D)                               \
-  "lbu     %[temp0],    "#A"(%[a])                   \n\t"      \
-  "lbu     %[temp1],    "#A"(%[b])                   \n\t"      \
-  "lbu     %[temp2],    "#B"(%[a])                   \n\t"      \
-  "lbu     %[temp3],    "#B"(%[b])                   \n\t"      \
-  "lbu     %[temp4],    "#C"(%[a])                   \n\t"      \
-  "lbu     %[temp5],    "#C"(%[b])                   \n\t"      \
-  "lbu     %[temp6],    "#D"(%[a])                   \n\t"      \
-  "lbu     %[temp7],    "#D"(%[b])                   \n\t"      \
+  "lbu     %[temp0],    " #A "(%[a])                 \n\t"      \
+  "lbu     %[temp1],    " #A "(%[b])                 \n\t"      \
+  "lbu     %[temp2],    " #B "(%[a])                 \n\t"      \
+  "lbu     %[temp3],    " #B "(%[b])                 \n\t"      \
+  "lbu     %[temp4],    " #C "(%[a])                 \n\t"      \
+  "lbu     %[temp5],    " #C "(%[b])                 \n\t"      \
+  "lbu     %[temp6],    " #D "(%[a])                 \n\t"      \
+  "lbu     %[temp7],    " #D "(%[b])                 \n\t"      \
  "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
  "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
  "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
@ -645,7 +539,6 @@ int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)

-#if !defined(WORK_AROUND_GCC)
 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -653,29 +546,29 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  __asm__ volatile(
     "mult   $zero,    $zero                            \n\t"

-     GET_SSE(  0,   4,   8,  12)
-     GET_SSE( 16,  20,  24,  28)
-     GET_SSE( 32,  36,  40,  44)
-     GET_SSE( 48,  52,  56,  60)
-     GET_SSE( 64,  68,  72,  76)
-     GET_SSE( 80,  84,  88,  92)
-     GET_SSE( 96, 100, 104, 108)
-     GET_SSE(112, 116, 120, 124)
-     GET_SSE(128, 132, 136, 140)
-     GET_SSE(144, 148, 152, 156)
-     GET_SSE(160, 164, 168, 172)
-     GET_SSE(176, 180, 184, 188)
-     GET_SSE(192, 196, 200, 204)
-     GET_SSE(208, 212, 216, 220)
-     GET_SSE(224, 228, 232, 236)
-     GET_SSE(240, 244, 248, 252)
+     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
+     GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
+     GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
+     GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+     GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+     GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+     GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+     GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+     GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)

    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
-    : "memory", "hi" , "lo"
+    : "memory", "hi", "lo"
  );
  return count;
 }
@ -687,21 +580,21 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  __asm__ volatile(
     "mult   $zero,    $zero                            \n\t"

-     GET_SSE(  0,   4,   8,  12)
-     GET_SSE( 16,  20,  24,  28)
-     GET_SSE( 32,  36,  40,  44)
-     GET_SSE( 48,  52,  56,  60)
-     GET_SSE( 64,  68,  72,  76)
-     GET_SSE( 80,  84,  88,  92)
-     GET_SSE( 96, 100, 104, 108)
-     GET_SSE(112, 116, 120, 124)
+     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)

    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
-    : "memory", "hi" , "lo"
+    : "memory", "hi", "lo"
  );
  return count;
 }
@ -713,17 +606,17 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  __asm__ volatile(
     "mult   $zero,    $zero                            \n\t"

-     GET_SSE( 0,   4,  16,  20)
-     GET_SSE(32,  36,  48,  52)
-     GET_SSE(64,  68,  80,  84)
-     GET_SSE(96, 100, 112, 116)
+     GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+     GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+     GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+     GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)

    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
-    : "memory", "hi" , "lo"
+    : "memory", "hi", "lo"
  );
  return count;
 }
@ -735,42 +628,45 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  __asm__ volatile(
     "mult   $zero,    $zero                            \n\t"

-     GET_SSE(0, 16, 32, 48)
+     GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)

    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
-    : "memory", "hi" , "lo"
+    : "memory", "hi", "lo"
  );
  return count;
 }

-#endif  // WORK_AROUND_GCC
+#undef GET_SSE
+#undef GET_SSE_INNER

-#undef GET_SSE_MIPS32
-#undef GET_SSE_MIPS32_INNER
-
-#endif  // WEBP_USE_MIPS32
+#endif  // !WORK_AROUND_GCC

 //------------------------------------------------------------------------------
 // Entry point

 extern void VP8EncDspInitMIPS32(void);

-void VP8EncDspInitMIPS32(void) {
-#if defined(WEBP_USE_MIPS32)
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
-  VP8FTransform = FTransform;
 #if !defined(WORK_AROUND_GCC)
  VP8SSE16x16 = SSE16x16;
  VP8SSE8x8 = SSE8x8;
  VP8SSE16x8 = SSE16x8;
  VP8SSE4x4 = SSE4x4;
 #endif
-#endif  // WEBP_USE_MIPS32
 }
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -32,9 +32,9 @@ static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.

 // This code works but is *slower* than the inlined-asm version below
 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
-// USE_INTRINSICS define.
+// WEBP_USE_INTRINSICS define.
 // With gcc-4.8, it's a little faster speed than inlined-assembly.
-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)

 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
@ -241,7 +241,7 @@ static void ITransformOne(const uint8_t* ref,
  );
 }

-#endif    // USE_INTRINSICS
+#endif    // WEBP_USE_INTRINSICS

 static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
@ -263,7 +263,7 @@ static uint8x16_t Load4x4(const uint8_t* src) {

 // Forward transform.

-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)

 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
                                         const int16x4_t C, const int16x4_t D,
@ -548,324 +548,166 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.

-// This code works but is *slower* than the inlined-asm version below
-// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
-// USE_INTRINSICS define.
-// With gcc-4.8, it's only slightly slower than the inlined.
-#if defined(USE_INTRINSICS)
+// a 0123, b 0123
+// a 4567, b 4567
+// a 89ab, b 89ab
+// a cdef, b cdef
+//
+// transpose
+//
+// a 048c, b 048c
+// a 159d, b 159d
+// a 26ae, b 26ae
+// a 37bf, b 37bf
+//
+static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) {
+  const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]);
+  const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]);
+  const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]),
+                                        vreinterpret_u16_u8(d2_tmp1.val[0]));
+  const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]),
+                                        vreinterpret_u16_u8(d2_tmp1.val[1]));

-// Zero extend an uint16x4_t 'v' to an int32x4_t.
-static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
-  return vreinterpretq_s32_u32(vmovl_u16(v));
+  d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]);
+  d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]);
+  d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]);
+  d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]);
+  return d4_in;
 }

-// Does a regular 4x4 transpose followed by an adjustment of the upper columns
-// in the inner rows to restore the source order of differences,
-// i.e., a0 - a1 | a3 - a2.
-static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) {
-  int32x4x4_t out = Transpose4x4(rows);
-  // restore source order in the columns containing differences.
-  const int32x2_t r1h = vget_high_s32(out.val[1]);
-  const int32x2_t r2h = vget_high_s32(out.val[2]);
-  out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h);
-  out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h);
-  return out;
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
+  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
+  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
+  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
+                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
+  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
+                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
+  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
+  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
+  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
+  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
+  return q4_in;
 }

-static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1,
-                                                   const uint8x8_t r2r3) {
-  // a0 = in[0] + in[2] | a1 = in[1] + in[3]
-  const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3);
-  // a3 = in[0] - in[2] | a2 = in[1] - in[3]
-  const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3);
-  const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1));  // a0 + a1
-  const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2));  // a3 + a2
-  // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations.
-  // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3
-  // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2
-  const int16x8x2_t transpose =
-      vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2));
-  // tmp[3] = a0 - a1 | tmp[2] = a3 - a2
-  const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]),
-                                      vget_low_s16(transpose.val[1]));
-  const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]),
-                                      vget_high_s16(transpose.val[1]));
-  // [0]: tmp[3] [1]: tmp[2]
-  const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2);
-  const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } };
-  return res;
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) {
+  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
+  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
+  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0],
+                                                        d4_in.val[2]));
+  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1],
+                                                        d4_in.val[3]));
+  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0],
+                                                        d4_in.val[2]));
+  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1],
+                                                        d4_in.val[3]));
+  int16x8x4_t q4_out;
+  // tmp[0] = a0 + a1
+  // tmp[1] = a3 + a2
+  // tmp[2] = a3 - a2
+  // tmp[3] = a0 - a1
+  INIT_VECTOR4(q4_out,
+               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
+               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
+  return q4_out;
 }

-static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) {
-  // a0 = tmp[0 + i] + tmp[8 + i];
-  const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]);
-  // a1 = tmp[4 + i] + tmp[12+ i];
-  const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]);
-  // a2 = tmp[4 + i] - tmp[12+ i];
-  const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]);
-  // a3 = tmp[0 + i] - tmp[8 + i];
-  const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]);
-  const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1));  // abs(a0 + a1)
-  const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2));  // abs(a3 + a2)
-  const int32x4_t b2 = vabdq_s32(a3, a2);              // abs(a3 - a2)
-  const int32x4_t b3 = vabdq_s32(a0, a1);              // abs(a0 - a1)
-  const int32x4x4_t res = { { b0, b1, b2, b3 } };
-  return res;
+static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) {
+  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
+  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
+  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
+  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
+
+  q4_in.val[0] = vaddq_s16(q_a0, q_a1);
+  q4_in.val[1] = vaddq_s16(q_a3, q_a2);
+  q4_in.val[2] = vabdq_s16(q_a3, q_a2);
+  q4_in.val[3] = vabdq_s16(q_a0, q_a1);
+  q4_in.val[0] = vabsq_s16(q4_in.val[0]);
+  q4_in.val[1] = vabsq_s16(q4_in.val[1]);
+  return q4_in;
 }

-// Calculate the weighted sum of the rows in 'b'.
-static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b,
-                                      const int32x4_t w0, const int32x4_t w1,
-                                      const int32x4_t w2, const int32x4_t w3) {
-  const int32x4_t s0 = vmulq_s32(w0, b.val[0]);
-  const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]);
-  const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]);
-  const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]);
-  const int64x2_t sum1 = vpaddlq_s32(s3);
-  const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1));
-  return sum2;
+static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
+  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
+  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
+  int16x4x4_t d4_w;
+  INIT_VECTOR4(d4_w,
+               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
+               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
+               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
+               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
+  return d4_w;
+}
+
+static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
+                                      const int16x4x4_t d4_w) {
+  int32x2_t d_sum;
+  // sum += w[ 0] * abs(b0);
+  // sum += w[ 4] * abs(b1);
+  // sum += w[ 8] * abs(b2);
+  // sum += w[12] * abs(b3);
+  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
+  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
+  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
+  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
+  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
+  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
+  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
+  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
+
+  q_sum0 = vaddq_s32(q_sum0, q_sum1);
+  q_sum2 = vaddq_s32(q_sum2, q_sum3);
+  q_sum2 = vaddq_s32(q_sum0, q_sum2);
+  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
+  d_sum = vpadd_s32(d_sum, d_sum);
+  return d_sum;
 }

 #define LOAD_LANE_32b(src, VALUE, LANE) \
-    (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
+    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
-  uint32x4_t d0d1 = { 0, 0, 0, 0 };
-  uint32x4_t d2d3 = { 0, 0, 0, 0 };
-  LOAD_LANE_32b(a + 0 * BPS, d0d1, 0);  // a00 a01 a02 a03
-  LOAD_LANE_32b(a + 1 * BPS, d0d1, 1);  // a10 a11 a12 a13
-  LOAD_LANE_32b(b + 0 * BPS, d0d1, 2);  // b00 b01 b02 b03
-  LOAD_LANE_32b(b + 1 * BPS, d0d1, 3);  // b10 b11 b12 b13
-  LOAD_LANE_32b(a + 2 * BPS, d2d3, 0);  // a20 a21 a22 a23
-  LOAD_LANE_32b(a + 3 * BPS, d2d3, 1);  // a30 a31 a32 a33
-  LOAD_LANE_32b(b + 2 * BPS, d2d3, 2);  // b20 b21 b22 b23
-  LOAD_LANE_32b(b + 3 * BPS, d2d3, 3);  // b30 b31 b32 b33
+  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
+  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
+  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
+  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
+  uint8x8x4_t d4_in;
+
+  // load data a, b
+  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
+  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
+  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
+  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
+  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
+  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
+  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
+  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
+  INIT_VECTOR4(d4_in,
+               vreinterpret_u8_u32(d_in_ab_0123),
+               vreinterpret_u8_u32(d_in_ab_4567),
+               vreinterpret_u8_u32(d_in_ab_89ab),
+               vreinterpret_u8_u32(d_in_ab_cdef));

  {
-    // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31
-    // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33
-    const uint16x8x2_t tmp =
-        vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3));
-    const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]);
-    const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]);
-    const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8),
-                                                    vget_low_u8(d2d3u8));
-    const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8),
-                                                    vget_high_u8(d2d3u8));
-    const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a);
-    const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b);
-    const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a);
-    const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b);
-    const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0));
-    const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4));
-    const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8));
-    const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12));
-    const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3);
-    const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3);
-    const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1),
-                                    vreinterpret_s32_s64(sum2));
-    const int32x2_t res = vshr_n_s32(diff, 5);
-    return vget_lane_s32(res, 0);
+    // horizontal pass
+    const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in);
+    const int16x8x4_t q4_h = DistoHorizontalPass(d4_t);
+    const int16x4x4_t d4_w = DistoLoadW(w);
+    // vertical pass
+    const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h);
+    const int16x8x4_t q4_v = DistoVerticalPass(q4_t);
+    int32x2_t d_sum = DistoSum(q4_v, d4_w);
+
+    // abs(sum2 - sum1) >> 5
+    d_sum = vabs_s32(d_sum);
+    d_sum  = vshr_n_s32(d_sum, 5);
+    return vget_lane_s32(d_sum, 0);
  }
 }
-
 #undef LOAD_LANE_32b

-#else
-
-// Hadamard transform
-// Returns the weighted sum of the absolute value of transformed coefficients.
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int kBPS = BPS;
-  const uint8_t* A = a;
-  const uint8_t* B = b;
-  const uint16_t* W = w;
-  int sum;
-  __asm__ volatile (
-    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
-    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
-    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
-    "vld1.32         d2[1], [%[a]]            \n"
-
-    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
-    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
-    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
-    "vld1.32         d3[1], [%[b]]            \n"
-
-    // a d0/d2, b d1/d3
-    // d0/d1: 01 01 01 01
-    // d2/d3: 23 23 23 23
-    // But: it goes 01 45 23 67
-    // Notice the middle values are transposed
-    "vtrn.16         q0, q1                   \n"
-
-    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
-    "vaddl.u8        q2, d0, d2               \n"
-    "vaddl.u8        q10, d1, d3              \n"
-    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
-    "vsubl.u8        q3, d0, d2               \n"
-    "vsubl.u8        q11, d1, d3              \n"
-
-    // tmp[0] = a0 + a1
-    "vpaddl.s16      q0, q2                   \n"
-    "vpaddl.s16      q8, q10                  \n"
-
-    // tmp[1] = a3 + a2
-    "vpaddl.s16      q1, q3                   \n"
-    "vpaddl.s16      q9, q11                  \n"
-
-    // No pair subtract
-    // q2 = {a0, a3}
-    // q3 = {a1, a2}
-    "vtrn.16         q2, q3                   \n"
-    "vtrn.16         q10, q11                 \n"
-
-    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
-    "vsubl.s16       q12, d4, d6              \n"
-    "vsubl.s16       q13, d5, d7              \n"
-    "vsubl.s16       q14, d20, d22            \n"
-    "vsubl.s16       q15, d21, d23            \n"
-
-    // separate tmp[3] and tmp[2]
-    // q12 = tmp[3]
-    // q13 = tmp[2]
-    "vtrn.32         q12, q13                 \n"
-    "vtrn.32         q14, q15                 \n"
-
-    // Transpose tmp for a
-    "vswp            d1, d26                  \n" // vtrn.64
-    "vswp            d3, d24                  \n" // vtrn.64
-    "vtrn.32         q0, q1                   \n"
-    "vtrn.32         q13, q12                 \n"
-
-    // Transpose tmp for b
-    "vswp            d17, d30                 \n" // vtrn.64
-    "vswp            d19, d28                 \n" // vtrn.64
-    "vtrn.32         q8, q9                   \n"
-    "vtrn.32         q15, q14                 \n"
-
-    // The first Q register is a, the second b.
-    // q0/8 tmp[0-3]
-    // q13/15 tmp[4-7]
-    // q1/9 tmp[8-11]
-    // q12/14 tmp[12-15]
-
-    // These are still in 01 45 23 67 order. We fix it easily in the addition
-    // case but the subtraction propagates them.
-    "vswp            d3, d27                  \n"
-    "vswp            d19, d31                 \n"
-
-    // a0 = tmp[0] + tmp[8]
-    "vadd.s32        q2, q0, q1               \n"
-    "vadd.s32        q3, q8, q9               \n"
-
-    // a1 = tmp[4] + tmp[12]
-    "vadd.s32        q10, q13, q12            \n"
-    "vadd.s32        q11, q15, q14            \n"
-
-    // a2 = tmp[4] - tmp[12]
-    "vsub.s32        q13, q13, q12            \n"
-    "vsub.s32        q15, q15, q14            \n"
-
-    // a3 = tmp[0] - tmp[8]
-    "vsub.s32        q0, q0, q1               \n"
-    "vsub.s32        q8, q8, q9               \n"
-
-    // b0 = a0 + a1
-    "vadd.s32        q1, q2, q10              \n"
-    "vadd.s32        q9, q3, q11              \n"
-
-    // b1 = a3 + a2
-    "vadd.s32        q12, q0, q13             \n"
-    "vadd.s32        q14, q8, q15             \n"
-
-    // b2 = a3 - a2
-    "vsub.s32        q0, q0, q13              \n"
-    "vsub.s32        q8, q8, q15              \n"
-
-    // b3 = a0 - a1
-    "vsub.s32        q2, q2, q10              \n"
-    "vsub.s32        q3, q3, q11              \n"
-
-    "vld1.64         {q10, q11}, [%[w]]       \n"
-
-    // abs(b0)
-    "vabs.s32        q1, q1                   \n"
-    "vabs.s32        q9, q9                   \n"
-    // abs(b1)
-    "vabs.s32        q12, q12                 \n"
-    "vabs.s32        q14, q14                 \n"
-    // abs(b2)
-    "vabs.s32        q0, q0                   \n"
-    "vabs.s32        q8, q8                   \n"
-    // abs(b3)
-    "vabs.s32        q2, q2                   \n"
-    "vabs.s32        q3, q3                   \n"
-
-    // expand w before using.
-    "vmovl.u16       q13, d20                 \n"
-    "vmovl.u16       q15, d21                 \n"
-
-    // w[0] * abs(b0)
-    "vmul.u32        q1, q1, q13              \n"
-    "vmul.u32        q9, q9, q13              \n"
-
-    // w[4] * abs(b1)
-    "vmla.u32        q1, q12, q15             \n"
-    "vmla.u32        q9, q14, q15             \n"
-
-    // expand w before using.
-    "vmovl.u16       q13, d22                 \n"
-    "vmovl.u16       q15, d23                 \n"
-
-    // w[8] * abs(b1)
-    "vmla.u32        q1, q0, q13              \n"
-    "vmla.u32        q9, q8, q13              \n"
-
-    // w[12] * abs(b1)
-    "vmla.u32        q1, q2, q15              \n"
-    "vmla.u32        q9, q3, q15              \n"
-
-    // Sum the arrays
-    "vpaddl.u32      q1, q1                   \n"
-    "vpaddl.u32      q9, q9                   \n"
-    "vadd.u64        d2, d3                   \n"
-    "vadd.u64        d18, d19                 \n"
-
-    // Hadamard transform needs 4 bits of extra precision (2 bits in each
-    // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum
-    // precision for coeff is 8bit of input + 4bits of Hadamard transform +
-    // 16bits for w[] + 2 bits of abs() summation.
-    //
-    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
-    // A-OK.
-
-    // sum2 - sum1
-    "vsub.u32        d0, d2, d18              \n"
-    // abs(sum2 - sum1)
-    "vabs.s32        d0, d0                   \n"
-    // abs(sum2 - sum1) >> 5
-    "vshr.u32        d0, #5                   \n"
-
-    // It would be better to move the value straight into r0 but I'm not
-    // entirely sure how this works with inline assembly.
-    "vmov.32         %[sum], d0[0]            \n"
-
-    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
-    : [kBPS] "r"(kBPS)
-    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
-  ) ;
-
-  return sum;
-}
-
-#endif  // USE_INTRINSICS
-
 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int D = 0;
@ -885,6 +727,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
@ -902,10 +745,11 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
-        histo->distribution[out[k]]++;
+        ++distribution[out[k]];
      }
    }
  }
+  VP8SetHistogramData(distribution, histo);
 }

 //------------------------------------------------------------------------------
@ -1049,17 +893,22 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return 0;
 }

-#endif   // !WORK_AROUND_GCC
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  int nz;
+  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  return nz;
+}

-#endif   // WEBP_USE_NEON
+#endif   // !WORK_AROUND_GCC

 //------------------------------------------------------------------------------
 // Entry point

 extern void VP8EncDspInitNEON(void);

-void VP8EncDspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;

@ -1074,6 +923,12 @@ void VP8EncDspInitNEON(void) {
  VP8SSE4x4 = SSE4x4;
 #if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
 #endif
-#endif   // WEBP_USE_NEON
 }
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
+
+#endif  // WEBP_USE_NEON
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
--- a/src/dsp/enc_sse41.c
+++ b/src/dsp/enc_sse41.c
@ -0,0 +1,373 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 version of some encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <smmintrin.h>
+#include <stdlib.h>  // for abs()
+
+#include "../enc/vp8enci.h"
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms.
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
+  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+  for (j = start_block; j < end_block; ++j) {
+    int16_t out[16];
+    int k;
+
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to bin (within out[]).
+    {
+      // Load.
+      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+      // v = abs(out) >> 3
+      const __m128i abs0 = _mm_abs_epi16(out0);
+      const __m128i abs1 = _mm_abs_epi16(out1);
+      const __m128i v0 = _mm_srai_epi16(abs0, 3);
+      const __m128i v1 = _mm_srai_epi16(abs1, 3);
+      // bin = min(v, MAX_COEFF_THRESH)
+      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+      // Store.
+      _mm_storeu_si128((__m128i*)&out[0], bin0);
+      _mm_storeu_si128((__m128i*)&out[8], bin1);
+    }
+
+    // Convert coefficients to bin.
+    for (k = 0; k < 16; ++k) {
+      ++distribution[out[k]];
+    }
+  }
+  VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the difference between the weighted sum of the absolute value of
+// transformed coefficients.
+static int TTransform(const uint8_t* inA, const uint8_t* inB,
+                      const uint16_t* const w) {
+  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+
+  // Load, combine and transpose inputs.
+  {
+    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
+    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
+    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
+    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
+    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
+    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
+    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
+
+    // Combine inA and inB (we'll do two transforms in parallel).
+    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
+    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
+    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
+    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
+    // a00 b00 a01 b01 a02 b03 a03 b03   0 0 0 0 0 0 0 0
+    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
+    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
+    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0
+
+    // Transpose the two 4x4, discarding the filling zeroes.
+    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
+    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
+    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
+    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
+    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33
+
+    // Convert to 16b.
+    tmp_0 = _mm_cvtepu8_epi16(transpose1_0);
+    tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8));
+    tmp_2 = _mm_cvtepu8_epi16(transpose1_1);
+    tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8));
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+
+    // Transpose the two 4x4.
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 a22 b32 b03 b13 b23 b33
+    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Vertical pass and difference of weighted sums.
+  {
+    // Load all inputs.
+    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
+
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+    // Separate the transforms of inA and inB.
+    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+    A_b0 = _mm_abs_epi16(A_b0);
+    A_b2 = _mm_abs_epi16(A_b2);
+    B_b0 = _mm_abs_epi16(B_b0);
+    B_b2 = _mm_abs_epi16(B_b2);
+
+    // weighted sums
+    A_b0 = _mm_madd_epi16(A_b0, w_0);
+    A_b2 = _mm_madd_epi16(A_b2, w_8);
+    B_b0 = _mm_madd_epi16(B_b0, w_0);
+    B_b2 = _mm_madd_epi16(B_b2, w_8);
+    A_b0 = _mm_add_epi32(A_b0, A_b2);
+    B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+    // difference of weighted sums
+    A_b2 = _mm_sub_epi32(A_b0, B_b0);
+    // cascading summation of the differences
+    B_b0 = _mm_hadd_epi32(A_b2, A_b2);
+    B_b2 = _mm_hadd_epi32(B_b0, B_b0);
+    return _mm_cvtsi128_si32(B_b2);
+  }
+}
+
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int diff_sum = TTransform(a, b, w);
+  return abs(diff_sum) >> 5;
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Generates a pshufb constant for shuffling 16b words.
+#define PSHUFB_CST(A,B,C,D,E,F,G,H) \
+  _mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0, \
+               2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0, \
+               2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
+               2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
+
+static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
+                                       const uint16_t* const sharpen,
+                                       const VP8Matrix* const mtx) {
+  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i out0, out8;
+  __m128i packed_out;
+
+  // Load all inputs.
+  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
+
+  // coeff = abs(in)
+  __m128i coeff0 = _mm_abs_epi16(in0);
+  __m128i coeff8 = _mm_abs_epi16(in8);
+
+  // coeff = abs(in) + sharpen
+  if (sharpen != NULL) {
+    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
+    coeff0 = _mm_add_epi16(coeff0, sharpen0);
+    coeff8 = _mm_add_epi16(coeff8, sharpen8);
+  }
+
+  // out = (coeff * iQ + B) >> QFIX
+  {
+    // doing calculations with 32b precision (QFIX=17)
+    // out = (coeff * iQ)
+    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+    // out = (coeff * iQ + B)
+    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
+    out_00 = _mm_add_epi32(out_00, bias_00);
+    out_04 = _mm_add_epi32(out_04, bias_04);
+    out_08 = _mm_add_epi32(out_08, bias_08);
+    out_12 = _mm_add_epi32(out_12, bias_12);
+    // out = QUANTDIV(coeff, iQ, B, QFIX)
+    out_00 = _mm_srai_epi32(out_00, QFIX);
+    out_04 = _mm_srai_epi32(out_04, QFIX);
+    out_08 = _mm_srai_epi32(out_08, QFIX);
+    out_12 = _mm_srai_epi32(out_12, QFIX);
+
+    // pack result as 16b
+    out0 = _mm_packs_epi32(out_00, out_04);
+    out8 = _mm_packs_epi32(out_08, out_12);
+
+    // if (coeff > 2047) coeff = 2047
+    out0 = _mm_min_epi16(out0, max_coeff_2047);
+    out8 = _mm_min_epi16(out8, max_coeff_2047);
+  }
+
+  // put sign back
+  out0 = _mm_sign_epi16(out0, in0);
+  out8 = _mm_sign_epi16(out8, in8);
+
+  // in = out * Q
+  in0 = _mm_mullo_epi16(out0, q0);
+  in8 = _mm_mullo_epi16(out8, q8);
+
+  _mm_storeu_si128((__m128i*)&in[0], in0);
+  _mm_storeu_si128((__m128i*)&in[8], in8);
+
+  // zigzag the output before storing it. The re-ordering is:
+  //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
+  // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
+  // There's only two misplaced entries ([8] and [7]) that are crossing the
+  // reg's boundaries.
+  // We use pshufb instead of pshuflo/pshufhi.
+  {
+    const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
+    const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
+    const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
+    const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7);  // extract #7
+    const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
+    const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
+    const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
+    const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8);  // extract #8
+    const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
+    const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
+    _mm_storeu_si128((__m128i*)&out[0], out_z0);
+    _mm_storeu_si128((__m128i*)&out[8], out_z8);
+    packed_out = _mm_packs_epi16(out_z0, out_z8);
+  }
+
+  // detect if all 'out' values are zeroes or not
+  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
+#undef PSHUFB_CST
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+                            const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, NULL, mtx);
+}
+
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  int nz;
+  const uint16_t* const sharpen = &mtx->sharpen_[0];
+  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitSSE41(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
+  VP8CollectHistogram = CollectHistogram;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)
+
+#endif  // WEBP_USE_SSE41
--- a/src/dsp/filters.c
+++ b/src/dsp/filters.c
@ -0,0 +1,240 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author: Urvang (urvang@google.com)
+
+#include "./dsp.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+# define SANITY_CHECK(in, out)                                                 \
+  assert(in != NULL);                                                          \
+  assert(out != NULL);                                                         \
+  assert(width > 0);                                                           \
+  assert(height > 0);                                                          \
+  assert(stride >= width);                                                     \
+  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
+  (void)height;  // Silence unused warning.
+
+static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
+                                    uint8_t* dst, int length, int inverse) {
+  int i;
+  if (inverse) {
+    for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
+  } else {
+    for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Leftmost pixel is the same as input for topmost scanline.
+    out[0] = in[0];
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    // Leftmost pixel is predicted from above.
+    PredictLine(in, preds - stride, out, 1, inverse);
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Very first top-left pixel is copied.
+    out[0] = in[0];
+    // Rest of top scan-line is left-predicted.
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
+    in += stride;
+    out += stride;
+  } else {
+    // We are starting from in-between. Make sure 'preds' points to prev row.
+    preds -= stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    PredictLine(in, preds, out, width, inverse);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+  const int g = a + b - c;
+  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
+}
+
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  // left prediction for top scan-line
+  if (row == 0) {
+    out[0] = in[0];
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    int w;
+    // leftmost pixel: predict from above.
+    PredictLine(in, preds - stride, out, 1, inverse);
+    for (w = 1; w < width; ++w) {
+      const int pred = GradientPredictor(preds[w - 1],
+                                         preds[w - stride],
+                                         preds[w - stride - 1]);
+      out[w] = in[w] + (inverse ? pred : -pred);
+    }
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+
+//------------------------------------------------------------------------------
+
+static void VerticalUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void HorizontalUnfilter(int width, int height, int stride, int row,
+                               int num_rows, uint8_t* data) {
+  DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void GradientUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+//------------------------------------------------------------------------------
+// Init function
+
+WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
+WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+
+extern void VP8FiltersInitMIPSdspR2(void);
+extern void VP8FiltersInitSSE2(void);
+
+static volatile VP8CPUInfo filters_last_cpuinfo_used =
+    (VP8CPUInfo)&filters_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
+  if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  WebPUnfilters[WEBP_FILTER_NONE] = NULL;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+
+  WebPFilters[WEBP_FILTER_NONE] = NULL;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8FiltersInitSSE2();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8FiltersInitMIPSdspR2();
+    }
+#endif
+  }
+  filters_last_cpuinfo_used = VP8GetCPUInfo;
+}
--- a/src/dsp/filters_mips_dsp_r2.c
+++ b/src/dsp/filters_mips_dsp_r2.c
@ -0,0 +1,405 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+//            Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/dsp.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+# define SANITY_CHECK(in, out)                                                 \
+  assert(in != NULL);                                                          \
+  assert(out != NULL);                                                         \
+  assert(width > 0);                                                           \
+  assert(height > 0);                                                          \
+  assert(stride >= width);                                                     \
+  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
+  (void)height;  // Silence unused warning.
+
+// if INVERSE
+//   preds == &dst[-1] == &src[-1]
+// else
+//   preds == &src[-1] != &dst[-1]
+#define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do {                        \
+    const uint8_t* psrc = (uint8_t*)(SRC);                                     \
+    uint8_t* pdst = (uint8_t*)(DST);                                           \
+    const int ilength = (int)(LENGTH);                                         \
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6;                       \
+    __asm__ volatile (                                                         \
+      ".set      push                                   \n\t"                  \
+      ".set      noreorder                              \n\t"                  \
+      "srl       %[temp0],    %[length],    0x2         \n\t"                  \
+      "beqz      %[temp0],    4f                        \n\t"                  \
+      " andi     %[temp6],    %[length],    0x3         \n\t"                  \
+    ".if " #INVERSE "                                   \n\t"                  \
+      "lbu       %[temp1],    -1(%[src])                \n\t"                  \
+    "1:                                                 \n\t"                  \
+      "lbu       %[temp2],    0(%[src])                 \n\t"                  \
+      "lbu       %[temp3],    1(%[src])                 \n\t"                  \
+      "lbu       %[temp4],    2(%[src])                 \n\t"                  \
+      "lbu       %[temp5],    3(%[src])                 \n\t"                  \
+      "addiu     %[src],      %[src],       4           \n\t"                  \
+      "addiu     %[temp0],    %[temp0],     -1          \n\t"                  \
+      "addu      %[temp2],    %[temp2],     %[temp1]    \n\t"                  \
+      "addu      %[temp3],    %[temp3],     %[temp2]    \n\t"                  \
+      "addu      %[temp4],    %[temp4],     %[temp3]    \n\t"                  \
+      "addu      %[temp1],    %[temp5],     %[temp4]    \n\t"                  \
+      "sb        %[temp2],    -4(%[src])                \n\t"                  \
+      "sb        %[temp3],    -3(%[src])                \n\t"                  \
+      "sb        %[temp4],    -2(%[src])                \n\t"                  \
+      "bnez      %[temp0],    1b                        \n\t"                  \
+      " sb       %[temp1],    -1(%[src])                \n\t"                  \
+    ".else                                              \n\t"                  \
+    "1:                                                 \n\t"                  \
+      "ulw       %[temp1],    -1(%[src])                \n\t"                  \
+      "ulw       %[temp2],    0(%[src])                 \n\t"                  \
+      "addiu     %[src],      %[src],       4           \n\t"                  \
+      "addiu     %[temp0],    %[temp0],     -1          \n\t"                  \
+      "subu.qb   %[temp3],    %[temp2],     %[temp1]    \n\t"                  \
+      "usw       %[temp3],    0(%[dst])                 \n\t"                  \
+      "bnez      %[temp0],    1b                        \n\t"                  \
+      " addiu    %[dst],      %[dst],       4           \n\t"                  \
+    ".endif                                             \n\t"                  \
+    "4:                                                 \n\t"                  \
+      "beqz      %[temp6],    3f                        \n\t"                  \
+      " nop                                             \n\t"                  \
+    "2:                                                 \n\t"                  \
+      "lbu       %[temp1],    -1(%[src])                \n\t"                  \
+      "lbu       %[temp2],    0(%[src])                 \n\t"                  \
+      "addiu     %[src],      %[src],       1           \n\t"                  \
+    ".if " #INVERSE "                                   \n\t"                  \
+      "addu      %[temp3],    %[temp1],     %[temp2]    \n\t"                  \
+      "sb        %[temp3],    -1(%[src])                \n\t"                  \
+    ".else                                              \n\t"                  \
+      "subu      %[temp3],    %[temp1],     %[temp2]    \n\t"                  \
+      "sb        %[temp3],    0(%[dst])                 \n\t"                  \
+    ".endif                                             \n\t"                  \
+      "addiu     %[temp6],    %[temp6],     -1          \n\t"                  \
+      "bnez      %[temp6],    2b                        \n\t"                  \
+      " addiu    %[dst],      %[dst],       1           \n\t"                  \
+    "3:                                                 \n\t"                  \
+      ".set      pop                                    \n\t"                  \
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),         \
+        [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),         \
+        [temp6]"=&r"(temp6), [dst]"+&r"(pdst), [src]"+&r"(psrc)                \
+      : [length]"r"(ilength)                                                   \
+      : "memory"                                                               \
+    );                                                                         \
+  } while (0)
+
+static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
+                                    int length, int inverse) {
+  if (inverse) {
+    DO_PREDICT_LINE(src, dst, length, 1);
+  } else {
+    DO_PREDICT_LINE(src, dst, length, 0);
+  }
+}
+
+#define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do {         \
+    const uint8_t* psrc = (uint8_t*)(SRC);                                     \
+    const uint8_t* ppred = (uint8_t*)(PRED);                                   \
+    uint8_t* pdst = (uint8_t*)(DST);                                           \
+    const int ilength = (int)(LENGTH);                                         \
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;                \
+    __asm__ volatile (                                                         \
+      ".set      push                                   \n\t"                  \
+      ".set      noreorder                              \n\t"                  \
+      "srl       %[temp0],    %[length],    0x3         \n\t"                  \
+      "beqz      %[temp0],    4f                        \n\t"                  \
+      " andi     %[temp7],    %[length],    0x7         \n\t"                  \
+    "1:                                                 \n\t"                  \
+      "ulw       %[temp1],    0(%[src])                 \n\t"                  \
+      "ulw       %[temp2],    0(%[pred])                \n\t"                  \
+      "ulw       %[temp3],    4(%[src])                 \n\t"                  \
+      "ulw       %[temp4],    4(%[pred])                \n\t"                  \
+      "addiu     %[src],      %[src],       8           \n\t"                  \
+    ".if " #INVERSE "                                   \n\t"                  \
+      "addu.qb   %[temp5],    %[temp1],     %[temp2]    \n\t"                  \
+      "addu.qb   %[temp6],    %[temp3],     %[temp4]    \n\t"                  \
+    ".else                                              \n\t"                  \
+      "subu.qb   %[temp5],    %[temp1],     %[temp2]    \n\t"                  \
+      "subu.qb   %[temp6],    %[temp3],     %[temp4]    \n\t"                  \
+    ".endif                                             \n\t"                  \
+      "addiu     %[pred],     %[pred],      8           \n\t"                  \
+      "usw       %[temp5],    0(%[dst])                 \n\t"                  \
+      "usw       %[temp6],    4(%[dst])                 \n\t"                  \
+      "addiu     %[temp0],    %[temp0],     -1          \n\t"                  \
+      "bnez      %[temp0],    1b                        \n\t"                  \
+      " addiu    %[dst],      %[dst],       8           \n\t"                  \
+    "4:                                                 \n\t"                  \
+      "beqz      %[temp7],    3f                        \n\t"                  \
+      " nop                                             \n\t"                  \
+    "2:                                                 \n\t"                  \
+      "lbu       %[temp1],    0(%[src])                 \n\t"                  \
+      "lbu       %[temp2],    0(%[pred])                \n\t"                  \
+      "addiu     %[src],      %[src],       1           \n\t"                  \
+      "addiu     %[pred],     %[pred],      1           \n\t"                  \
+    ".if " #INVERSE "                                   \n\t"                  \
+      "addu      %[temp3],    %[temp1],     %[temp2]    \n\t"                  \
+    ".else                                              \n\t"                  \
+      "subu      %[temp3],    %[temp1],     %[temp2]    \n\t"                  \
+    ".endif                                             \n\t"                  \
+      "sb        %[temp3],    0(%[dst])                 \n\t"                  \
+      "addiu     %[temp7],    %[temp7],     -1          \n\t"                  \
+      "bnez      %[temp7],    2b                        \n\t"                  \
+      " addiu    %[dst],      %[dst],       1           \n\t"                  \
+    "3:                                                 \n\t"                  \
+      ".set      pop                                    \n\t"                  \
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),         \
+        [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),         \
+        [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [pred]"+&r"(ppred),          \
+        [dst]"+&r"(pdst), [src]"+&r"(psrc)                                     \
+      : [length]"r"(ilength)                                                   \
+      : "memory"                                                               \
+    );                                                                         \
+  } while (0)
+
+#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST, INVERSE) do {                    \
+    int temp1, temp2, temp3;                                                   \
+    __asm__ volatile (                                                         \
+      "lbu       %[temp1],   0(%[src])               \n\t"                     \
+      "lbu       %[temp2],   0(%[pred])              \n\t"                     \
+    ".if " #INVERSE "                                \n\t"                     \
+      "addu      %[temp3],   %[temp1],   %[temp2]    \n\t"                     \
+    ".else                                           \n\t"                     \
+      "subu      %[temp3],   %[temp1],   %[temp2]    \n\t"                     \
+    ".endif                                          \n\t"                     \
+      "sb        %[temp3],   0(%[dst])               \n\t"                     \
+      : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)          \
+      : [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC))                    \
+      : "memory"                                                               \
+    );                                                                         \
+  } while (0)
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+#define FILTER_LINE_BY_LINE(INVERSE) do {                                      \
+    while (row < last_row) {                                                   \
+      PREDICT_LINE_ONE_PASS(in, preds - stride, out, INVERSE);                 \
+      DO_PREDICT_LINE(in + 1, out + 1, width - 1, INVERSE);                    \
+      ++row;                                                                   \
+      preds += stride;                                                         \
+      in += stride;                                                            \
+      out += stride;                                                           \
+    }                                                                          \
+  } while (0)
+
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Leftmost pixel is the same as input for topmost scanline.
+    out[0] = in[0];
+    PredictLine(in + 1, out + 1, width - 1, inverse);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  if (inverse) {
+    FILTER_LINE_BY_LINE(1);
+  } else {
+    FILTER_LINE_BY_LINE(0);
+  }
+}
+
+#undef FILTER_LINE_BY_LINE
+
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void HorizontalUnfilter(int width, int height, int stride, int row,
+                               int num_rows, uint8_t* data) {
+  DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+#define FILTER_LINE_BY_LINE(INVERSE) do {                                      \
+    while (row < last_row) {                                                   \
+      DO_PREDICT_LINE_VERTICAL(in, preds, out, width, INVERSE);                \
+      ++row;                                                                   \
+      preds += stride;                                                         \
+      in += stride;                                                            \
+      out += stride;                                                           \
+    }                                                                          \
+  } while (0)
+
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Very first top-left pixel is copied.
+    out[0] = in[0];
+    // Rest of top scan-line is left-predicted.
+    PredictLine(in + 1, out + 1, width - 1, inverse);
+    row = 1;
+    in += stride;
+    out += stride;
+  } else {
+    // We are starting from in-between. Make sure 'preds' points to prev row.
+    preds -= stride;
+  }
+
+  // Filter line-by-line.
+  if (inverse) {
+    FILTER_LINE_BY_LINE(1);
+  } else {
+    FILTER_LINE_BY_LINE(0);
+  }
+}
+
+#undef FILTER_LINE_BY_LINE
+#undef DO_PREDICT_LINE_VERTICAL
+
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void VerticalUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+  int temp0;
+  __asm__ volatile (
+    "addu             %[temp0],   %[a],       %[b]        \n\t"
+    "subu             %[temp0],   %[temp0],   %[c]        \n\t"
+    "shll_s.w         %[temp0],   %[temp0],   23          \n\t"
+    "precrqu_s.qb.ph  %[temp0],   %[temp0],   $zero       \n\t"
+    "srl              %[temp0],   %[temp0],   24          \n\t"
+    : [temp0]"=&r"(temp0)
+    : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+  );
+  return temp0;
+}
+
+#define FILTER_LINE_BY_LINE(INVERSE, PREDS, OPERATION) do {                    \
+    while (row < last_row) {                                                   \
+      int w;                                                                   \
+      PREDICT_LINE_ONE_PASS(in, PREDS - stride, out, INVERSE);                 \
+      for (w = 1; w < width; ++w) {                                            \
+        const int pred = GradientPredictor(PREDS[w - 1],                       \
+                                           PREDS[w - stride],                  \
+                                           PREDS[w - stride - 1]);             \
+        out[w] = in[w] OPERATION pred;                                         \
+      }                                                                        \
+      ++row;                                                                   \
+      in += stride;                                                            \
+      out += stride;                                                           \
+    }                                                                          \
+  } while (0)
+
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  // left prediction for top scan-line
+  if (row == 0) {
+    out[0] = in[0];
+    PredictLine(in + 1, out + 1, width - 1, inverse);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  if (inverse) {
+    FILTER_LINE_BY_LINE(1, out, +);
+  } else {
+    FILTER_LINE_BY_LINE(0, in, -);
+  }
+}
+
+#undef FILTER_LINE_BY_LINE
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void GradientUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+#undef PREDICT_LINE_ONE_PASS
+#undef DO_PREDICT_LINE
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/filters_sse2.c
+++ b/src/dsp/filters_sse2.c
@ -0,0 +1,352 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of alpha filters
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+# define SANITY_CHECK(in, out)                                                 \
+  assert(in != NULL);                                                          \
+  assert(out != NULL);                                                         \
+  assert(width > 0);                                                           \
+  assert(height > 0);                                                          \
+  assert(stride >= width);                                                     \
+  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
+  (void)height;  // Silence unused warning.
+
+static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
+                           uint8_t* dst, int length, int inverse) {
+  int i;
+  const int max_pos = length & ~31;
+  assert(length >= 0);
+  if (inverse) {
+    for (i = 0; i < max_pos; i += 32) {
+      const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i +  0]);
+      const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]);
+      const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i +  0]);
+      const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]);
+      const __m128i C0 = _mm_add_epi8(A0, B0);
+      const __m128i C1 = _mm_add_epi8(A1, B1);
+      _mm_storeu_si128((__m128i*)&dst[i +  0], C0);
+      _mm_storeu_si128((__m128i*)&dst[i + 16], C1);
+    }
+    for (; i < length; ++i) dst[i] = src[i] + pred[i];
+  } else {
+    for (i = 0; i < max_pos; i += 32) {
+      const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i +  0]);
+      const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]);
+      const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i +  0]);
+      const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]);
+      const __m128i C0 = _mm_sub_epi8(A0, B0);
+      const __m128i C1 = _mm_sub_epi8(A1, B1);
+      _mm_storeu_si128((__m128i*)&dst[i +  0], C0);
+      _mm_storeu_si128((__m128i*)&dst[i + 16], C1);
+    }
+    for (; i < length; ++i) dst[i] = src[i] - pred[i];
+  }
+}
+
+// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
+static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
+                            int inverse) {
+  int i;
+  if (length <= 0) return;
+  if (inverse) {
+    const int max_pos = length & ~7;
+    __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]);
+    for (i = 0; i < max_pos; i += 8) {
+      const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i));
+      const __m128i A1 = _mm_add_epi8(A0, last);
+      const __m128i A2 = _mm_slli_si128(A1, 1);
+      const __m128i A3 = _mm_add_epi8(A1, A2);
+      const __m128i A4 = _mm_slli_si128(A3, 2);
+      const __m128i A5 = _mm_add_epi8(A3, A4);
+      const __m128i A6 = _mm_slli_si128(A5, 4);
+      const __m128i A7 = _mm_add_epi8(A5, A6);
+      _mm_storel_epi64((__m128i*)(dst + i), A7);
+      last = _mm_srli_epi64(A7, 56);
+    }
+    for (; i < length; ++i) dst[i] = src[i] + dst[i - 1];
+  } else {
+    const int max_pos = length & ~31;
+    for (i = 0; i < max_pos; i += 32) {
+      const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i +  0    ));
+      const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i +  0 - 1));
+      const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16    ));
+      const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1));
+      const __m128i C0 = _mm_sub_epi8(A0, B0);
+      const __m128i C1 = _mm_sub_epi8(A1, B1);
+      _mm_storeu_si128((__m128i*)(dst + i +  0), C0);
+      _mm_storeu_si128((__m128i*)(dst + i + 16), C1);
+    }
+    for (; i < length; ++i) dst[i] = src[i] - src[i - 1];
+  }
+}
+
+static void PredictLineC(const uint8_t* src, const uint8_t* pred,
+                         uint8_t* dst, int length, int inverse) {
+  int i;
+  if (inverse) {
+    for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
+  } else {
+    for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Leftmost pixel is the same as input for topmost scanline.
+    out[0] = in[0];
+    PredictLineLeft(in + 1, out + 1, width - 1, inverse);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    // Leftmost pixel is predicted from above.
+    PredictLineC(in, preds - stride, out, 1, inverse);
+    PredictLineLeft(in + 1, out + 1, width - 1, inverse);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Very first top-left pixel is copied.
+    out[0] = in[0];
+    // Rest of top scan-line is left-predicted.
+    PredictLineLeft(in + 1, out + 1, width - 1, inverse);
+    row = 1;
+    in += stride;
+    out += stride;
+  } else {
+    // We are starting from in-between. Make sure 'preds' points to prev row.
+    preds -= stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    PredictLineTop(in, preds, out, width, inverse);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
+  const int g = a + b - c;
+  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
+}
+
+static void GradientPredictDirect(const uint8_t* const row,
+                                  const uint8_t* const top,
+                                  uint8_t* const out, int length) {
+  const int max_pos = length & ~7;
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i < max_pos; i += 8) {
+    const __m128i A0 = _mm_loadl_epi64((const __m128i*)&row[i - 1]);
+    const __m128i B0 = _mm_loadl_epi64((const __m128i*)&top[i]);
+    const __m128i C0 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
+    const __m128i D = _mm_loadl_epi64((const __m128i*)&row[i]);
+    const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
+    const __m128i B1 = _mm_unpacklo_epi8(B0, zero);
+    const __m128i C1 = _mm_unpacklo_epi8(C0, zero);
+    const __m128i E = _mm_add_epi16(A1, B1);
+    const __m128i F = _mm_sub_epi16(E, C1);
+    const __m128i G = _mm_packus_epi16(F, zero);
+    const __m128i H = _mm_sub_epi8(D, G);
+    _mm_storel_epi64((__m128i*)(out + i), H);
+  }
+  for (; i < length; ++i) {
+    out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+  }
+}
+
+static void GradientPredictInverse(const uint8_t* const in,
+                                   const uint8_t* const top,
+                                   uint8_t* const row, int length) {
+  if (length > 0) {
+    int i;
+    const int max_pos = length & ~7;
+    const __m128i zero = _mm_setzero_si128();
+    __m128i A = _mm_set_epi32(0, 0, 0, row[-1]);   // left sample
+    for (i = 0; i < max_pos; i += 8) {
+      const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]);
+      const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
+      const __m128i B = _mm_unpacklo_epi8(tmp0, zero);
+      const __m128i C = _mm_unpacklo_epi8(tmp1, zero);
+      const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]);
+      const __m128i D = _mm_unpacklo_epi8(tmp2, zero);   // base input
+      const __m128i E = _mm_sub_epi16(B, C);  // unclipped gradient basis B - C
+      __m128i out = zero;                     // accumulator for output
+      __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff);
+      int k = 8;
+      while (1) {
+        const __m128i tmp3 = _mm_add_epi16(A, E);        // delta = A + B - C
+        const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi);
+        const __m128i tmp5 = _mm_max_epi16(tmp4, zero);  // clipped delta
+        const __m128i tmp6 = _mm_add_epi16(tmp5, D);     // add to in[] values
+        A = _mm_and_si128(tmp6, mask_hi);                // 1-complement clip
+        out = _mm_or_si128(out, A);                      // accumulate output
+        if (--k == 0) break;
+        A = _mm_slli_si128(A, 2);                        // rotate left sample
+        mask_hi = _mm_slli_si128(mask_hi, 2);            // rotate mask
+      }
+      A = _mm_srli_si128(A, 14);       // prepare left sample for next iteration
+      _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero));
+    }
+    for (; i < length; ++i) {
+      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+    }
+  }
+}
+
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+
+  // left prediction for top scan-line
+  if (row == 0) {
+    out[0] = in[0];
+    PredictLineLeft(in + 1, out + 1, width - 1, inverse);
+    row = 1;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    if (inverse) {
+      PredictLineC(in, out - stride, out, 1, inverse);  // predict from above
+      GradientPredictInverse(in + 1, out + 1 - stride, out + 1, width - 1);
+    } else {
+      PredictLineC(in, in - stride, out, 1, inverse);
+      GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
+    }
+    ++row;
+    in += stride;
+    out += stride;
+  }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+
+//------------------------------------------------------------------------------
+
+static void VerticalUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void HorizontalUnfilter(int width, int height, int stride, int row,
+                               int num_rows, uint8_t* data) {
+  DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void GradientUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -25,14 +25,17 @@
 extern "C" {
 #endif

+#ifdef WEBP_EXPERIMENTAL_FEATURES
+#include "../enc/delta_palettization.h"
+#endif  // WEBP_EXPERIMENTAL_FEATURES
+
 //------------------------------------------------------------------------------
-// Signatures and generic function-pointers
+// Decoding

 typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];

 typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
-extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;

 typedef struct {
@ -44,39 +47,8 @@ typedef struct {
 } VP8LMultipliers;
 typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
                                       uint32_t* argb_data, int num_pixels);
-extern VP8LTransformColorFunc VP8LTransformColor;
 extern VP8LTransformColorFunc VP8LTransformColorInverse;

-typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
-                                uint8_t* dst);
-extern VP8LConvertFunc VP8LConvertBGRAToRGB;
-extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
-extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
-extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
-extern VP8LConvertFunc VP8LConvertBGRAToBGR;
-
-// Expose some C-only fallback functions
-void VP8LTransformColor_C(const VP8LMultipliers* const m,
-                          uint32_t* data, int num_pixels);
-void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
-                                 uint32_t* data, int num_pixels);
-
-void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
-                                 int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
-                               int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
-void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
-
-// Must be called before calling any of the above methods.
-void VP8LDspInit(void);
-
-//------------------------------------------------------------------------------
-// Image transforms.
-
 struct VP8LTransform;  // Defined in dec/vp8li.h.

 // Performs inverse transform of data given transform information, start and end
@ -87,6 +59,48 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform,
                          int row_start, int row_end,
                          const uint32_t* const in, uint32_t* const out);

+// Color space conversion.
+typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
+                                uint8_t* dst);
+extern VP8LConvertFunc VP8LConvertBGRAToRGB;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
+extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
+extern VP8LConvertFunc VP8LConvertBGRAToBGR;
+
+// Converts from BGRA to other color spaces.
+void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
+                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
+
+// color mapping related functions.
+static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
+  return (idx >> 8) & 0xff;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
+  return idx;
+}
+
+static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
+  return val;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
+  return (val >> 8) & 0xff;
+}
+
+typedef void (*VP8LMapARGBFunc)(const uint32_t* src,
+                                const uint32_t* const color_map,
+                                uint32_t* dst, int y_start,
+                                int y_end, int width);
+typedef void (*VP8LMapAlphaFunc)(const uint8_t* src,
+                                 const uint32_t* const color_map,
+                                 uint8_t* dst, int y_start,
+                                 int y_end, int width);
+
+extern VP8LMapARGBFunc VP8LMapColor32b;
+extern VP8LMapAlphaFunc VP8LMapColor8b;
+
 // Similar to the static method ColorIndexInverseTransform() that is part of
 // lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
 // uint32_t) arguments for 'src' and 'dst'.
@ -94,20 +108,61 @@ void VP8LColorIndexInverseTransformAlpha(
    const struct VP8LTransform* const transform, int y_start, int y_end,
    const uint8_t* src, uint8_t* dst);

-void VP8LResidualImage(int width, int height, int bits,
+// Expose some C-only fallback functions
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
+                                 uint32_t* data, int num_pixels);
+
+void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+                               int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
+
+// Must be called before calling any of the above methods.
+void VP8LDspInit(void);
+
+//------------------------------------------------------------------------------
+// Encoding
+
+extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+extern VP8LTransformColorFunc VP8LTransformColor;
+typedef void (*VP8LCollectColorBlueTransformsFunc)(
+    const uint32_t* argb, int stride,
+    int tile_width, int tile_height,
+    int green_to_blue, int red_to_blue, int histo[]);
+extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+
+typedef void (*VP8LCollectColorRedTransformsFunc)(
+    const uint32_t* argb, int stride,
+    int tile_width, int tile_height,
+    int green_to_red, int histo[]);
+extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
+
+// Expose some C-only fallback functions
+void VP8LTransformColor_C(const VP8LMultipliers* const m,
+                          uint32_t* data, int num_pixels);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+                                     int tile_width, int tile_height,
+                                     int green_to_red, int histo[]);
+void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+                                      int tile_width, int tile_height,
+                                      int green_to_blue, int red_to_blue,
+                                      int histo[]);
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
                       uint32_t* const argb, uint32_t* const argb_scratch,
-                       uint32_t* const image);
+                       uint32_t* const image, int exact);

 void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
                             uint32_t* const argb, uint32_t* image);

-//------------------------------------------------------------------------------
-// Color space conversion.
-
-// Converts from BGRA to other color spaces.
-void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
-                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
-
 //------------------------------------------------------------------------------
 // Misc methods.

@ -119,6 +174,14 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,

 // -----------------------------------------------------------------------------
 // Faster logarithm for integers. Small values use a look-up table.
+
+// The threshold till approximate version of log_2 can be used.
+// Practically, we can get rid of the call to log() as the two values match to
+// very high degree (the ratio of these two is 0.99999x).
+// Keeping a high threshold for now.
+#define APPROX_LOG_WITH_CORRECTION_MAX  65536
+#define APPROX_LOG_MAX                   4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
 #define LOG_LOOKUP_IDX_MAX 256
 extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
 extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
@ -141,23 +204,56 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
 typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
 typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
                                       int length);
+typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
+                                                const int Y[256]);

 extern VP8LCostFunc VP8LExtraCost;
 extern VP8LCostCombinedFunc VP8LExtraCostCombined;
+extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;

 typedef struct {        // small struct to hold counters
  int counts[2];        // index: 0=zero steak, 1=non-zero streak
  int streaks[2][2];    // [zero/non-zero][streak<3 / streak>=3]
 } VP8LStreaks;

-typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population,
-                                         int length);
 typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
                                                 const uint32_t* Y, int length);

-extern VP8LCostCountFunc VP8LHuffmanCostCount;
 extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;

+typedef struct {            // small struct to hold bit entropy results
+  double entropy;           // entropy
+  uint32_t sum;             // sum of the population
+  int nonzeros;             // number of non-zero elements in the population
+  uint32_t max_val;         // maximum value in the population
+  uint32_t nonzero_code;    // index of the last non-zero in the population
+} VP8LBitEntropy;
+
+void VP8LBitEntropyInit(VP8LBitEntropy* const entropy);
+
+// Get the combined symbol bit entropy and Huffman cost stats for the
+// distributions 'X' and 'Y'. Those results can then be refined according to
+// codec specific heuristics.
+void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X,
+                                     const uint32_t* const Y, int length,
+                                     VP8LBitEntropy* const bit_entropy,
+                                     VP8LStreaks* const stats);
+// Get the entropy for the distribution 'X'.
+void VP8LGetEntropyUnrefined(const uint32_t* const X, int length,
+                             VP8LBitEntropy* const bit_entropy,
+                             VP8LStreaks* const stats);
+
+void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
+                              VP8LBitEntropy* const entropy);
+
+typedef void (*GetEntropyUnrefinedHelperFunc)(uint32_t val, int i,
+                                              uint32_t* const val_prev,
+                                              int* const i_prev,
+                                              VP8LBitEntropy* const bit_entropy,
+                                              VP8LStreaks* const stats);
+// Internal function used by VP8LGet*EntropyUnrefined.
+extern GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper;
+
 typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a,
                                     const VP8LHistogram* const b,
                                     VP8LHistogram* const out);
@ -240,6 +336,9 @@ static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
 void VP8LBundleColorMap(const uint8_t* const row, int width,
                        int xbits, uint32_t* const dst);

+// Must be called before calling any of the above methods.
+void VP8LEncDspInit(void);
+
 //------------------------------------------------------------------------------

 #ifdef __cplusplus
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
--- a/src/dsp/lossless_enc_mips32.c
+++ b/src/dsp/lossless_enc_mips32.c
@ -1,4 +1,4 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
+// Copyright 2015 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
@ -22,10 +22,6 @@
 #include <stdlib.h>
 #include <string.h>

-#define APPROX_LOG_WITH_CORRECTION_MAX  65536
-#define APPROX_LOG_MAX                   4096
-#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
-
 static float FastSLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
@ -217,58 +213,31 @@ static double ExtraCostCombined(const uint32_t* const X,
  );

 // Returns the various RLE counts
-static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
-  int i;
-  int streak = 0;
-  VP8LStreaks stats;
-  int* const pstreaks = &stats.streaks[0][0];
-  int* const pcnts = &stats.counts[0];
+static WEBP_INLINE void GetEntropyUnrefinedHelper(
+    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+  int* const pstreaks = &stats->streaks[0][0];
+  int* const pcnts = &stats->counts[0];
  int temp0, temp1, temp2, temp3;
-  memset(&stats, 0, sizeof(stats));
-  for (i = 0; i < length - 1; ++i) {
-    ++streak;
-    if (population[i] == population[i + 1]) {
-      continue;
+  const int streak = i - *i_prev;
+
+  // Gather info for the bit entropy.
+  if (*val_prev != 0) {
+    bit_entropy->sum += (*val_prev) * streak;
+    bit_entropy->nonzeros += streak;
+    bit_entropy->nonzero_code = *i_prev;
+    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+    if (bit_entropy->max_val < *val_prev) {
+      bit_entropy->max_val = *val_prev;
    }
-    temp0 = (population[i] != 0);
-    HUFFMAN_COST_PASS
-    streak = 0;
  }
-  ++streak;
-  temp0 = (population[i] != 0);
+
+  // Gather info for the Huffman cost.
+  temp0 = (*val_prev != 0);
  HUFFMAN_COST_PASS

-  return stats;
-}
-
-static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
-                                            const uint32_t* Y, int length) {
-  int i;
-  int streak = 0;
-  VP8LStreaks stats;
-  int* const pstreaks = &stats.streaks[0][0];
-  int* const pcnts = &stats.counts[0];
-  int temp0, temp1, temp2, temp3;
-  memset(&stats, 0, sizeof(stats));
-  for (i = 0; i < length - 1; ++i) {
-    const uint32_t xy = X[i] + Y[i];
-    const uint32_t xy_next = X[i + 1] + Y[i + 1];
-    ++streak;
-    if (xy == xy_next) {
-      continue;
-    }
-    temp0 = (xy != 0);
-    HUFFMAN_COST_PASS
-    streak = 0;
-  }
-  {
-    const uint32_t xy = X[i] + Y[i];
-    ++streak;
-    temp0 = (xy != 0);
-    HUFFMAN_COST_PASS
-  }
-
-  return stats;
+  *val_prev = val;
+  *i_prev = i;
 }

 #define ASM_START                                       \
@ -285,28 +254,28 @@ static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
 // literal_ and successive histograms could be unaligned
 // so we must use ulw and usw
 #define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)           \
-    "ulw    %[temp0], "#A"(%["#P0"])        \n\t"       \
-    "ulw    %[temp1], "#B"(%["#P0"])        \n\t"       \
-    "ulw    %[temp2], "#C"(%["#P0"])        \n\t"       \
-    "ulw    %[temp3], "#D"(%["#P0"])        \n\t"       \
-    "ulw    %[temp4], "#A"(%["#P1"])        \n\t"       \
-    "ulw    %[temp5], "#B"(%["#P1"])        \n\t"       \
-    "ulw    %[temp6], "#C"(%["#P1"])        \n\t"       \
-    "ulw    %[temp7], "#D"(%["#P1"])        \n\t"       \
+    "ulw    %[temp0], " #A "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp1], " #B "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp2], " #C "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp3], " #D "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp4], " #A "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp5], " #B "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp6], " #C "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp7], " #D "(%[" #P1 "])    \n\t"       \
    "addu   %[temp4], %[temp4],   %[temp0]  \n\t"       \
    "addu   %[temp5], %[temp5],   %[temp1]  \n\t"       \
    "addu   %[temp6], %[temp6],   %[temp2]  \n\t"       \
    "addu   %[temp7], %[temp7],   %[temp3]  \n\t"       \
-    "addiu  %["#P0"],  %["#P0"],  16        \n\t"       \
-  ".if "#E" == 1                            \n\t"       \
-    "addiu  %["#P1"],  %["#P1"],  16        \n\t"       \
+    "addiu  %[" #P0 "],  %[" #P0 "],  16    \n\t"       \
+  ".if " #E " == 1                          \n\t"       \
+    "addiu  %[" #P1 "],  %[" #P1 "],  16    \n\t"       \
  ".endif                                   \n\t"       \
-    "usw    %[temp4], "#A"(%["#P2"])        \n\t"       \
-    "usw    %[temp5], "#B"(%["#P2"])        \n\t"       \
-    "usw    %[temp6], "#C"(%["#P2"])        \n\t"       \
-    "usw    %[temp7], "#D"(%["#P2"])        \n\t"       \
-    "addiu  %["#P2"], %["#P2"],   16        \n\t"       \
-    "bne    %["#P0"], %[LoopEnd], 1b        \n\t"       \
+    "usw    %[temp4], " #A "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp5], " #B "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp6], " #C "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp7], " #D "(%[" #P2 "])    \n\t"       \
+    "addiu  %[" #P2 "], %[" #P2 "],   16    \n\t"       \
+    "bne    %[" #P0 "], %[LoopEnd], 1b      \n\t"       \
    ".set   pop                             \n\t"       \

 #define ASM_END_COMMON_0                                \
@ -396,21 +365,22 @@ static void HistogramAdd(const VP8LHistogram* const a,
 #undef ADD_TO_OUT
 #undef ASM_START

-#endif  // WEBP_USE_MIPS32
-
 //------------------------------------------------------------------------------
 // Entry point

-extern void VP8LDspInitMIPS32(void);
+extern void VP8LEncDspInitMIPS32(void);

-void VP8LDspInitMIPS32(void) {
-#if defined(WEBP_USE_MIPS32)
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
  VP8LFastSLog2Slow = FastSLog2Slow;
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LExtraCost = ExtraCost;
  VP8LExtraCostCombined = ExtraCostCombined;
-  VP8LHuffmanCostCount = HuffmanCostCount;
-  VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
+  VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper;
  VP8LHistogramAdd = HistogramAdd;
-#endif  // WEBP_USE_MIPS32
 }
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
--- a/src/dsp/lossless_enc_mips_dsp_r2.c
+++ b/src/dsp/lossless_enc_mips_dsp_r2.c
@ -0,0 +1,275 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./lossless.h"
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
+                                        int num_pixels) {
+  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+  uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
+  uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
+  __asm__ volatile (
+    ".set       push                                          \n\t"
+    ".set       noreorder                                     \n\t"
+    "beq        %[argb_data],    %[p_loop1_end],     3f       \n\t"
+    " nop                                                     \n\t"
+  "0:                                                         \n\t"
+    "lw         %[temp0],        0(%[argb_data])              \n\t"
+    "lw         %[temp1],        4(%[argb_data])              \n\t"
+    "lw         %[temp2],        8(%[argb_data])              \n\t"
+    "lw         %[temp3],        12(%[argb_data])             \n\t"
+    "ext        %[temp4],        %[temp0],           8,    8  \n\t"
+    "ext        %[temp5],        %[temp1],           8,    8  \n\t"
+    "ext        %[temp6],        %[temp2],           8,    8  \n\t"
+    "ext        %[temp7],        %[temp3],           8,    8  \n\t"
+    "addiu      %[argb_data],    %[argb_data],       16       \n\t"
+    "replv.ph   %[temp4],        %[temp4]                     \n\t"
+    "replv.ph   %[temp5],        %[temp5]                     \n\t"
+    "replv.ph   %[temp6],        %[temp6]                     \n\t"
+    "replv.ph   %[temp7],        %[temp7]                     \n\t"
+    "subu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
+    "subu.qb    %[temp1],        %[temp1],           %[temp5] \n\t"
+    "subu.qb    %[temp2],        %[temp2],           %[temp6] \n\t"
+    "subu.qb    %[temp3],        %[temp3],           %[temp7] \n\t"
+    "sw         %[temp0],        -16(%[argb_data])            \n\t"
+    "sw         %[temp1],        -12(%[argb_data])            \n\t"
+    "sw         %[temp2],        -8(%[argb_data])             \n\t"
+    "bne        %[argb_data],    %[p_loop1_end],     0b       \n\t"
+    " sw        %[temp3],        -4(%[argb_data])             \n\t"
+  "3:                                                         \n\t"
+    "beq        %[argb_data],    %[p_loop2_end],     2f       \n\t"
+    " nop                                                     \n\t"
+  "1:                                                         \n\t"
+    "lw         %[temp0],        0(%[argb_data])              \n\t"
+    "addiu      %[argb_data],    %[argb_data],       4        \n\t"
+    "ext        %[temp4],        %[temp0],           8,    8  \n\t"
+    "replv.ph   %[temp4],        %[temp4]                     \n\t"
+    "subu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
+    "bne        %[argb_data],    %[p_loop2_end],     1b       \n\t"
+    " sw        %[temp0],        -4(%[argb_data])             \n\t"
+  "2:                                                         \n\t"
+    ".set       pop                                           \n\t"
+    : [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+      [temp7]"=&r"(temp7)
+    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+    : "memory"
+  );
+}
+
+static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
+                                                int8_t color) {
+  return (uint32_t)((int)(color_pred) * color) >> 5;
+}
+
+static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
+                           int num_pixels) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  uint32_t argb, argb1, new_red, new_red1;
+  const uint32_t G_to_R = m->green_to_red_;
+  const uint32_t G_to_B = m->green_to_blue_;
+  const uint32_t R_to_B = m->red_to_blue_;
+  uint32_t* const p_loop_end = data + (num_pixels & ~1);
+  __asm__ volatile (
+    ".set            push                                    \n\t"
+    ".set            noreorder                               \n\t"
+    "beq             %[data],      %[p_loop_end],  1f        \n\t"
+    " nop                                                    \n\t"
+    "replv.ph        %[temp0],     %[G_to_R]                 \n\t"
+    "replv.ph        %[temp1],     %[G_to_B]                 \n\t"
+    "replv.ph        %[temp2],     %[R_to_B]                 \n\t"
+    "shll.ph         %[temp0],     %[temp0],       8         \n\t"
+    "shll.ph         %[temp1],     %[temp1],       8         \n\t"
+    "shll.ph         %[temp2],     %[temp2],       8         \n\t"
+    "shra.ph         %[temp0],     %[temp0],       8         \n\t"
+    "shra.ph         %[temp1],     %[temp1],       8         \n\t"
+    "shra.ph         %[temp2],     %[temp2],       8         \n\t"
+  "0:                                                        \n\t"
+    "lw              %[argb],      0(%[data])                \n\t"
+    "lw              %[argb1],     4(%[data])                \n\t"
+    "lhu             %[new_red],   2(%[data])                \n\t"
+    "lhu             %[new_red1],  6(%[data])                \n\t"
+    "precrq.qb.ph    %[temp3],     %[argb],        %[argb1]  \n\t"
+    "precr.qb.ph     %[temp4],     %[argb],        %[argb1]  \n\t"
+    "preceu.ph.qbra  %[temp3],     %[temp3]                  \n\t"
+    "preceu.ph.qbla  %[temp4],     %[temp4]                  \n\t"
+    "shll.ph         %[temp3],     %[temp3],       8         \n\t"
+    "shll.ph         %[temp4],     %[temp4],       8         \n\t"
+    "shra.ph         %[temp3],     %[temp3],       8         \n\t"
+    "shra.ph         %[temp4],     %[temp4],       8         \n\t"
+    "mul.ph          %[temp5],     %[temp3],       %[temp0]  \n\t"
+    "mul.ph          %[temp3],     %[temp3],       %[temp1]  \n\t"
+    "mul.ph          %[temp4],     %[temp4],       %[temp2]  \n\t"
+    "addiu           %[data],      %[data],        8         \n\t"
+    "ins             %[new_red1],  %[new_red],     16,   16  \n\t"
+    "ins             %[argb1],     %[argb],        16,   16  \n\t"
+    "shra.ph         %[temp5],     %[temp5],       5         \n\t"
+    "shra.ph         %[temp3],     %[temp3],       5         \n\t"
+    "shra.ph         %[temp4],     %[temp4],       5         \n\t"
+    "subu.ph         %[new_red1],  %[new_red1],    %[temp5]  \n\t"
+    "subu.ph         %[argb1],     %[argb1],       %[temp3]  \n\t"
+    "preceu.ph.qbra  %[temp5],     %[new_red1]               \n\t"
+    "subu.ph         %[argb1],     %[argb1],       %[temp4]  \n\t"
+    "preceu.ph.qbra  %[temp3],     %[argb1]                  \n\t"
+    "sb              %[temp5],     -2(%[data])               \n\t"
+    "sb              %[temp3],     -4(%[data])               \n\t"
+    "sra             %[temp5],     %[temp5],       16        \n\t"
+    "sra             %[temp3],     %[temp3],       16        \n\t"
+    "sb              %[temp5],     -6(%[data])               \n\t"
+    "bne             %[data],      %[p_loop_end],  0b        \n\t"
+    " sb             %[temp3],     -8(%[data])               \n\t"
+  "1:                                                        \n\t"
+    ".set            pop                                     \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
+      [argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
+    : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+      [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+    : "memory", "hi", "lo"
+  );
+
+  if (num_pixels & 1) {
+    const uint32_t argb_ = data[0];
+    const uint32_t green = argb_ >> 8;
+    const uint32_t red = argb_ >> 16;
+    uint32_t new_blue = argb_;
+    new_red = red;
+    new_red -= ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+    new_blue &= 0xff;
+    data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);
+  }
+}
+
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+                                              uint8_t red_to_blue,
+                                              uint32_t argb) {
+  const uint32_t green = argb >> 8;
+  const uint32_t red = argb >> 16;
+  uint8_t new_blue = argb;
+  new_blue -= ColorTransformDelta(green_to_blue, green);
+  new_blue -= ColorTransformDelta(red_to_blue, red);
+  return (new_blue & 0xff);
+}
+
+static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
+                                       int tile_width, int tile_height,
+                                       int green_to_blue, int red_to_blue,
+                                       int histo[]) {
+  const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
+  const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
+  const uint32_t mask = 0xff00ffu;
+  while (tile_height-- > 0) {
+    int x;
+    const uint32_t* p_argb = argb;
+    argb += stride;
+    for (x = 0; x < (tile_width >> 1); ++x) {
+      int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+      __asm__ volatile (
+        "lw           %[temp0],  0(%[p_argb])             \n\t"
+        "lw           %[temp1],  4(%[p_argb])             \n\t"
+        "precr.qb.ph  %[temp2],  %[temp0],  %[temp1]      \n\t"
+        "ins          %[temp1],  %[temp0],  16,    16     \n\t"
+        "shra.ph      %[temp2],  %[temp2],  8             \n\t"
+        "shra.ph      %[temp3],  %[temp1],  8             \n\t"
+        "mul.ph       %[temp5],  %[temp2],  %[rtb]        \n\t"
+        "mul.ph       %[temp6],  %[temp3],  %[gtb]        \n\t"
+        "and          %[temp4],  %[temp1],  %[mask]       \n\t"
+        "addiu        %[p_argb], %[p_argb], 8             \n\t"
+        "shra.ph      %[temp5],  %[temp5],  5             \n\t"
+        "shra.ph      %[temp6],  %[temp6],  5             \n\t"
+        "subu.qb      %[temp2],  %[temp4],  %[temp5]      \n\t"
+        "subu.qb      %[temp2],  %[temp2],  %[temp6]      \n\t"
+        : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+          [temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
+        : [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
+        : "memory", "hi", "lo"
+      );
+      ++histo[(uint8_t)(temp2 >> 16)];
+      ++histo[(uint8_t)temp2];
+    }
+    if (tile_width & 1) {
+      ++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
+    }
+  }
+}
+
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+                                             uint32_t argb) {
+  const uint32_t green = argb >> 8;
+  uint32_t new_red = argb >> 16;
+  new_red -= ColorTransformDelta(green_to_red, green);
+  return (new_red & 0xff);
+}
+
+static void CollectColorRedTransforms(const uint32_t* argb, int stride,
+                                      int tile_width, int tile_height,
+                                      int green_to_red, int histo[]) {
+  const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
+  while (tile_height-- > 0) {
+    int x;
+    const uint32_t* p_argb = argb;
+    argb += stride;
+    for (x = 0; x < (tile_width >> 1); ++x) {
+      int temp0, temp1, temp2, temp3, temp4;
+      __asm__ volatile (
+        "lw           %[temp0],  0(%[p_argb])             \n\t"
+        "lw           %[temp1],  4(%[p_argb])             \n\t"
+        "precrq.ph.w  %[temp4],  %[temp0],  %[temp1]      \n\t"
+        "ins          %[temp1],  %[temp0],  16,    16     \n\t"
+        "shra.ph      %[temp3],  %[temp1],  8             \n\t"
+        "mul.ph       %[temp2],  %[temp3],  %[gtr]        \n\t"
+        "addiu        %[p_argb], %[p_argb], 8             \n\t"
+        "shra.ph      %[temp2],  %[temp2],  5             \n\t"
+        "subu.qb      %[temp2],  %[temp4],  %[temp2]      \n\t"
+        : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+        : [gtr]"r"(gtr)
+        : "memory", "hi", "lo"
+      );
+      ++histo[(uint8_t)(temp2 >> 16)];
+      ++histo[(uint8_t)temp2];
+    }
+    if (tile_width & 1) {
+      ++histo[TransformColorRed(green_to_red, *p_argb)];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LTransformColor = TransformColor;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/lossless_enc_neon.c
+++ b/src/dsp/lossless_enc_neon.c
@ -0,0 +1,143 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "./lossless.h"
+#include "./neon.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+// non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
+#define USE_VTBLQ
+#endif
+
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+  1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+                                             const uint8x16_t shuffle) {
+  return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
+                     vtbl1q_u8(argb, vget_high_u8(shuffle)));
+}
+#else  // !USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+                                             const uint8x8_t shuffle) {
+  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+                     vtbl1_u8(vget_high_u8(argb), shuffle));
+}
+#endif  // USE_VTBLQ
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
+  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
+  for (; argb_data < end; argb_data += 4) {
+    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static void TransformColor(const VP8LMultipliers* const m,
+                           uint32_t* argb_data, int num_pixels) {
+  // sign-extended multiplying constants, pre-shifted by 6.
+#define CST(X)  (((int16_t)(m->X << 8)) >> 6)
+  const int16_t rb[8] = {
+    CST(green_to_blue_), CST(green_to_red_),
+    CST(green_to_blue_), CST(green_to_red_),
+    CST(green_to_blue_), CST(green_to_red_),
+    CST(green_to_blue_), CST(green_to_red_)
+  };
+  const int16x8_t mults_rb = vld1q_s16(rb);
+  const int16_t b2[8] = {
+    0, CST(red_to_blue_), 0, CST(red_to_blue_),
+    0, CST(red_to_blue_), 0, CST(red_to_blue_),
+  };
+  const int16x8_t mults_b2 = vld1q_s16(b2);
+#undef CST
+#ifdef USE_VTBLQ
+  static const uint8_t kg0g0[16] = {
+    255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
+  };
+  const uint8x16_t shuffle = vld1q_u8(kg0g0);
+#else
+  static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
+  const uint8x8_t shuffle = vld1_u8(k0g0g);
+#endif
+  const uint32x4_t mask_rb = vdupq_n_u32(0x00ff00ffu);  // red-blue masks
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
+    // 0 g 0 g
+    const uint8x16_t greens = DoGreenShuffle(in, shuffle);
+    // x dr  x db1
+    const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
+    // r 0   b   0
+    const int16x8_t B = vshlq_n_s16(vreinterpretq_s16_u8(in), 8);
+    // x db2 0   0
+    const int16x8_t C = vqdmulhq_s16(B, mults_b2);
+    // 0 0   x db2
+    const uint32x4_t D = vshrq_n_u32(vreinterpretq_u32_s16(C), 16);
+    // x dr  x  db
+    const int8x16_t E = vaddq_s8(vreinterpretq_s8_u32(D),
+                                 vreinterpretq_s8_s16(A));
+    // 0 dr  0  db
+    const uint32x4_t F = vandq_u32(vreinterpretq_u32_s8(E), mask_rb);
+    const int8x16_t out = vsubq_s8(vreinterpretq_s8_u8(in),
+                                   vreinterpretq_s8_u32(F));
+    vst1q_s8((int8_t*)(argb_data + i), out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
+#undef USE_VTBLQ
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LTransformColor = TransformColor;
+}
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitNEON)
+
+#endif  // WEBP_USE_NEON
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@ -0,0 +1,345 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <assert.h>
+#include <emmintrin.h>
+#include "./lossless.h"
+
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X)  (((int16_t)((uint16_t)X << 8)) >> 5)
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+    const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
+    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
+    const __m128i out = _mm_sub_epi8(in, C);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static void TransformColor(const VP8LMultipliers* const m,
+                           uint32_t* argb_data, int num_pixels) {
+  const __m128i mults_rb = _mm_set_epi16(
+      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
+      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
+      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
+      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
+  const __m128i mults_b2 = _mm_set_epi16(
+      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
+      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
+  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
+  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
+    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
+    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
+    const __m128i E = _mm_slli_epi16(in, 8);           // r 0   b   0
+    const __m128i F = _mm_mulhi_epi16(E, mults_b2);    // x db2 0   0
+    const __m128i G = _mm_srli_epi32(F, 16);           // 0 0   x db2
+    const __m128i H = _mm_add_epi8(G, D);              // x dr  x  db
+    const __m128i I = _mm_and_si128(H, mask_rb);       // 0 dr  0  db
+    const __m128i out = _mm_sub_epi8(in, I);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+#define SPAN 8
+static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
+                                       int tile_width, int tile_height,
+                                       int green_to_blue, int red_to_blue,
+                                       int histo[]) {
+  const __m128i mults_r = _mm_set_epi16(
+      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
+      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
+  const __m128i mults_g = _mm_set_epi16(
+      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
+      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
+  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
+  const __m128i mask_b = _mm_set1_epi32(0x0000ff);  // blue mask
+  int y;
+  for (y = 0; y < tile_height; ++y) {
+    const uint32_t* const src = argb + y * stride;
+    int i, x;
+    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+      uint16_t values[SPAN];
+      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x +        0]);
+      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+      const __m128i A0 = _mm_slli_epi16(in0, 8);        // r 0  | b 0
+      const __m128i A1 = _mm_slli_epi16(in1, 8);
+      const __m128i B0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
+      const __m128i B1 = _mm_and_si128(in1, mask_g);
+      const __m128i C0 = _mm_mulhi_epi16(A0, mults_r);  // x db | 0 0
+      const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
+      const __m128i D0 = _mm_mulhi_epi16(B0, mults_g);  // 0 0  | x db
+      const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
+      const __m128i E0 = _mm_sub_epi8(in0, D0);         // x x  | x b'
+      const __m128i E1 = _mm_sub_epi8(in1, D1);
+      const __m128i F0 = _mm_srli_epi32(C0, 16);        // 0 0  | x db
+      const __m128i F1 = _mm_srli_epi32(C1, 16);
+      const __m128i G0 = _mm_sub_epi8(E0, F0);          // 0 0  | x b'
+      const __m128i G1 = _mm_sub_epi8(E1, F1);
+      const __m128i H0 = _mm_and_si128(G0, mask_b);     // 0 0  | 0 b
+      const __m128i H1 = _mm_and_si128(G1, mask_b);
+      const __m128i I = _mm_packs_epi32(H0, H1);        // 0 b' | 0 b'
+      _mm_storeu_si128((__m128i*)values, I);
+      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+    }
+  }
+  {
+    const int left_over = tile_width & (SPAN - 1);
+    if (left_over > 0) {
+      VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+                                       left_over, tile_height,
+                                       green_to_blue, red_to_blue, histo);
+    }
+  }
+}
+
+static void CollectColorRedTransforms(const uint32_t* argb, int stride,
+                                      int tile_width, int tile_height,
+                                      int green_to_red, int histo[]) {
+  const __m128i mults_g = _mm_set_epi16(
+      0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
+      0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
+  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
+  const __m128i mask = _mm_set1_epi32(0xff);
+
+  int y;
+  for (y = 0; y < tile_height; ++y) {
+    const uint32_t* const src = argb + y * stride;
+    int i, x;
+    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+      uint16_t values[SPAN];
+      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x +        0]);
+      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+      const __m128i A0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
+      const __m128i A1 = _mm_and_si128(in1, mask_g);
+      const __m128i B0 = _mm_srli_epi32(in0, 16);       // 0 0  | x r
+      const __m128i B1 = _mm_srli_epi32(in1, 16);
+      const __m128i C0 = _mm_mulhi_epi16(A0, mults_g);  // 0 0  | x dr
+      const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
+      const __m128i E0 = _mm_sub_epi8(B0, C0);          // x x  | x r'
+      const __m128i E1 = _mm_sub_epi8(B1, C1);
+      const __m128i F0 = _mm_and_si128(E0, mask);       // 0 0  | 0 r'
+      const __m128i F1 = _mm_and_si128(E1, mask);
+      const __m128i I = _mm_packs_epi32(F0, F1);
+      _mm_storeu_si128((__m128i*)values, I);
+      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+    }
+  }
+  {
+    const int left_over = tile_width & (SPAN - 1);
+    if (left_over > 0) {
+      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+                                      left_over, tile_height,
+                                      green_to_red, histo);
+    }
+  }
+}
+#undef SPAN
+
+//------------------------------------------------------------------------------
+
+#define LINE_SIZE 16    // 8 or 16
+static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
+                      int size) {
+  int i;
+  assert(size % LINE_SIZE == 0);
+  for (i = 0; i < size; i += LINE_SIZE) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i +  0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i +  8]);
+    const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
+#endif
+    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
+    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+  }
+}
+
+static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
+  int i;
+  assert(size % LINE_SIZE == 0);
+  for (i = 0; i < size; i += LINE_SIZE) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]);
+    const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
+#endif
+    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
+    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+  }
+}
+#undef LINE_SIZE
+
+// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
+// that's ok since the histogram values are less than 1<<28 (max picture size).
+static void HistogramAdd(const VP8LHistogram* const a,
+                         const VP8LHistogram* const b,
+                         VP8LHistogram* const out) {
+  int i;
+  const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+  assert(a->palette_code_bits_ == b->palette_code_bits_);
+  if (b != out) {
+    AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
+    AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
+  } else {
+    AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
+    AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
+  }
+  for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
+    out->literal_[i] = a->literal_[i] + b->literal_[i];
+  }
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    out->distance_[i] = a->distance_[i] + b->distance_[i];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entropy
+
+// Checks whether the X or Y contribution is worth computing and adding.
+// Used in loop unrolling.
+#define ANALYZE_X_OR_Y(x_or_y, j)                                   \
+  do {                                                              \
+    if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \
+  } while (0)
+
+// Checks whether the X + Y contribution is worth computing and adding.
+// Used in loop unrolling.
+#define ANALYZE_XY(j)                  \
+  do {                                 \
+    if (tmp[j] != 0) {                 \
+      retval -= VP8LFastSLog2(tmp[j]); \
+      ANALYZE_X_OR_Y(X, j);            \
+    }                                  \
+  } while (0)
+
+static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+  int i;
+  double retval = 0.;
+  int sumX, sumXY;
+  int32_t tmp[4];
+  __m128i zero = _mm_setzero_si128();
+  // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
+  __m128i sumXY_128 = zero;
+  __m128i sumX_128 = zero;
+
+  for (i = 0; i < 256; i += 4) {
+    const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
+    const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
+
+    // Check if any X is non-zero: this actually provides a speedup as X is
+    // usually sparse.
+    if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
+      const __m128i xy_128 = _mm_add_epi32(x, y);
+      sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
+
+      sumX_128 = _mm_add_epi32(sumX_128, x);
+
+      // Analyze the different X + Y.
+      _mm_storeu_si128((__m128i*)tmp, xy_128);
+
+      ANALYZE_XY(0);
+      ANALYZE_XY(1);
+      ANALYZE_XY(2);
+      ANALYZE_XY(3);
+    } else {
+      // X is fully 0, so only deal with Y.
+      sumXY_128 = _mm_add_epi32(sumXY_128, y);
+
+      ANALYZE_X_OR_Y(Y, 0);
+      ANALYZE_X_OR_Y(Y, 1);
+      ANALYZE_X_OR_Y(Y, 2);
+      ANALYZE_X_OR_Y(Y, 3);
+    }
+  }
+
+  // Sum up sumX_128 to get sumX.
+  _mm_storeu_si128((__m128i*)tmp, sumX_128);
+  sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+
+  // Sum up sumXY_128 to get sumXY.
+  _mm_storeu_si128((__m128i*)tmp, sumXY_128);
+  sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+
+  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+  return (float)retval;
+}
+#undef ANALYZE_X_OR_Y
+#undef ANALYZE_XY
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LTransformColor = TransformColor;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms;
+  VP8LHistogramAdd = HistogramAdd;
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy;
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/lossless_enc_sse41.c
+++ b/src/dsp/lossless_enc_sse41.c
@ -0,0 +1,51 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4.1 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <assert.h>
+#include <smmintrin.h>
+#include "./lossless.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  int i;
+  const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
+                                           -1,  5, -1,  5, -1, 1, -1, 1);
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
+    const __m128i out = _mm_sub_epi8(in, in_0g0g);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41)
+
+#endif  // WEBP_USE_SSE41
--- a/Show More
+++ b/Show More