vwebp: fix incorrect clipping w/NO_BLEND

When the previous frame does not specify dispose to background, only the current frame's rectangle should be cleared. Related to bug #245. (cherry picked from commit 469ba2cdfd) Change-Id: I2fc4f5be99057e0bf87d8fedec57b06859b070bd
update issue tracker url
2025-07-15 05:19:48 +02:00 · 2015-10-23 13:10:17 -07:00 · 2015-10-20 22:44:52 -07:00 · 2015-10-19 15:41:24 -07:00 · 2015-10-19 15:41:24 -07:00 · 2015-10-19 15:41:23 -07:00
56 changed files with 1861 additions and 1060 deletions
--- a/1
+++ b/1
@ -16,6 +16,7 @@ Contributors:
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
 - Pierre Joye (pierre dot php at gmail dot com)
+- Sam Clegg (sbc at chromium dot org)
 - Scott LaVarnway (slavarnway at google dot com)
 - Scott Talbot (s at chikachow dot org)
 - Slobodan Prijic (slobodan dot prijic at imgtec dot com)
--- a/Android.mk
+++ b/Android.mk
@ -10,8 +10,6 @@ ifeq ($(APP_OPTIM),release)
  endif
 endif

-include $(CLEAR_VARS)
-
 ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
  # instructions to be generated for armv7a code. Instead target the neon code
@ -21,7 +19,7 @@ else
  NEON := c
 endif

-LOCAL_SRC_FILES := \
+dec_srcs := \
    src/dec/alpha.c \
    src/dec/buffer.c \
    src/dec/frame.c \
@ -32,6 +30,11 @@ LOCAL_SRC_FILES := \
    src/dec/vp8.c \
    src/dec/vp8l.c \
    src/dec/webp.c \
+
+demux_srcs := \
+    src/demux/demux.c \
+
+dsp_dec_srcs := \
    src/dsp/alpha_processing.c \
    src/dsp/alpha_processing_sse2.c \
    src/dsp/cpu.c \
@ -40,11 +43,6 @@ LOCAL_SRC_FILES := \
    src/dsp/dec_mips32.c \
    src/dsp/dec_neon.$(NEON) \
    src/dsp/dec_sse2.c \
-    src/dsp/enc.c \
-    src/dsp/enc_avx2.c \
-    src/dsp/enc_mips32.c \
-    src/dsp/enc_neon.$(NEON) \
-    src/dsp/enc_sse2.c \
    src/dsp/lossless.c \
    src/dsp/lossless_mips32.c \
    src/dsp/lossless_neon.$(NEON) \
@ -55,6 +53,15 @@ LOCAL_SRC_FILES := \
    src/dsp/yuv.c \
    src/dsp/yuv_mips32.c \
    src/dsp/yuv_sse2.c \
+
+dsp_enc_srcs := \
+    src/dsp/enc.c \
+    src/dsp/enc_avx2.c \
+    src/dsp/enc_mips32.c \
+    src/dsp/enc_neon.$(NEON) \
+    src/dsp/enc_sse2.c \
+
+enc_srcs := \
    src/enc/alpha.c \
    src/enc/analysis.c \
    src/enc/backward_references.c \
@ -75,19 +82,38 @@ LOCAL_SRC_FILES := \
    src/enc/tree.c \
    src/enc/vp8l.c \
    src/enc/webpenc.c \
+
+mux_srcs := \
+    src/mux/muxedit.c \
+    src/mux/muxinternal.c \
+    src/mux/muxread.c \
+
+utils_dec_srcs := \
    src/utils/bit_reader.c \
-    src/utils/bit_writer.c \
    src/utils/color_cache.c \
    src/utils/filters.c \
    src/utils/huffman.c \
-    src/utils/huffman_encode.c \
-    src/utils/quant_levels.c \
    src/utils/quant_levels_dec.c \
    src/utils/random.c \
    src/utils/rescaler.c \
    src/utils/thread.c \
    src/utils/utils.c \

+utils_enc_srcs := \
+    src/utils/bit_writer.c \
+    src/utils/huffman_encode.c \
+    src/utils/quant_levels.c \
+
+################################################################################
+# libwebpdecoder
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    $(dec_srcs) \
+    $(dsp_dec_srcs) \
+    $(utils_dec_srcs) \
+
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/src

@ -96,6 +122,38 @@ LOCAL_ARM_MODE := arm

 LOCAL_STATIC_LIBRARIES := cpufeatures

+LOCAL_MODULE := webpdecoder_static
+
+include $(BUILD_STATIC_LIBRARY)
+
+ifeq ($(ENABLE_SHARED),1)
+include $(CLEAR_VARS)
+
+LOCAL_WHOLE_STATIC_LIBRARIES := webpdecoder_static
+
+LOCAL_MODULE := webpdecoder
+
+include $(BUILD_SHARED_LIBRARY)
+endif  # ENABLE_SHARED=1
+
+################################################################################
+# libwebp
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    $(dsp_enc_srcs) \
+    $(enc_srcs) \
+    $(utils_enc_srcs) \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
+
+# prefer arm over thumb mode for performance gains
+LOCAL_ARM_MODE := arm
+
+LOCAL_WHOLE_STATIC_LIBRARIES := webpdecoder_static
+
 LOCAL_MODULE := webp

 ifeq ($(ENABLE_SHARED),1)
@ -104,6 +162,54 @@ else
  include $(BUILD_STATIC_LIBRARY)
 endif

+################################################################################
+# libwebpdemux
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(demux_srcs)
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
+
+# prefer arm over thumb mode for performance gains
+LOCAL_ARM_MODE := arm
+
+LOCAL_MODULE := webpdemux
+
+ifeq ($(ENABLE_SHARED),1)
+  LOCAL_SHARED_LIBRARIES := webp
+  include $(BUILD_SHARED_LIBRARY)
+else
+  LOCAL_STATIC_LIBRARIES := webp
+  include $(BUILD_STATIC_LIBRARY)
+endif
+
+################################################################################
+# libwebpmux
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(mux_srcs)
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
+
+# prefer arm over thumb mode for performance gains
+LOCAL_ARM_MODE := arm
+
+LOCAL_MODULE := webpmux
+
+ifeq ($(ENABLE_SHARED),1)
+  LOCAL_SHARED_LIBRARIES := webp
+  include $(BUILD_SHARED_LIBRARY)
+else
+  LOCAL_STATIC_LIBRARIES := webp
+  include $(BUILD_STATIC_LIBRARY)
+endif
+
+################################################################################
+
 include $(LOCAL_PATH)/examples/Android.mk

 $(call import-module,android/cpufeatures)
--- a/46
+++ b/46
@ -1,3 +1,47 @@
+a661e50 Disable NEON code on Native Client
+fcd94e9 update ChangeLog (tag: v0.4.3-rc1)
+569fe57 update NEWS
+bd852f5 bump version to 0.4.3
+2d58b64 WebPPictureRescale: add a note about 0 width/height
+a0d8ca5 examples/Android.mk: add webpmux_example target
+34b1d29 Android.mk: add webpmux target
+7561988 Android.mk: add webpdemux target
+a987576 Android.mk: add webpdecoder{,_static} targets
+a6d4859 Android.mk: split source lists per-directory
+77544d5 fix iOS arm64 build with Xcode 6.3
+6dea157 doc/webp-container-spec: note MSB order for chunk diagrams
+f7cd57b doc/webp-container-spec: cosmetics
+1d6b250 vwebp: clear canvas at the beginning of each loop
+f97b3f8 webp-container-spec: clarify background clear on loop
+4ba83c1 vwebp: remove unnecessary static Help() prototype
+d34e8e3 vwebp/animation: display last frame on end-of-loop
+bbbc524 dec/vp8: clear 'dither_' on skipped blocks
+0339fa2 lossless_neon: enable subtract green for aarch64
+5a0c220 Regression fix for lossless decoding
+6e3a31d wicdec: (msvs) quiet some /analyze warnings
+b49a578 dwebp/WritePNG: mark png variables volatile
+0a4391a dwebp: include setjmp.h w/WEBP_HAVE_PNG
+90f1ec5 dwebp: correct sign in format strings
+b61ce86 VP8LEncodeStream: add an assert
+df1081b dsp/cpu: (msvs) add include for __cpuidex
+39aa055 dsp/cpu: (msvs) avoid immintrin.h on _M_ARM
+f814f42 dsp/cpu: add include for _xgetbv() w/MSVS
+8508ab9 cpu: fix AVX2 detection for gcc/clang targets
+5769623 fix handling of zero-sized partition #0 corner case
+b2e71a9 make the 'last_cpuinfo_used' variable names unique
+1273e84 add -Wformat-nonliteral and -Wformat-security
+3ae78eb multi-thread fix: lock each entry points with a static var
+5c1eeda webp-container-spec: remove references to fragments
+c5ceea4 enc_neon: fix building with non-Xcode clang (iOS)
+d0859d6 iosbuild: add x64_64 simulator support
+046732c WebPEncode: Support encoding same pic twice (even if modified)
+4426f50 webp/types.h: use inline for clang++/-std=c++11
+e297fc7 gif2webp: Use the default hint instead of WEBP_HINT_GRAPH.
+855fe43 Makefile.vc: add a 'legacy' RTLIBCFG option
+b7eb6d5 gif2webp: Support GIF_DISPOSE_RESTORE_PREVIOUS
+5691bdd gif2webp: Handle frames with odd offsets + disposal to background.
+8301da1 stopwatch.h: fix includes
+6a2209a update ChangeLog (tag: v0.4.2, origin/0.4.2, 0.4.2)
 36cad6a bit_reader.h: cosmetics: fix a typo
 e2ecae6 enc_mips32: workaround gcc-4.9 bug
 243e68d update ChangeLog (tag: v0.4.2-rc2)
@ -74,7 +118,7 @@ c2fc52e restore encode API compatibility
 793368e restore decode API compatibility
 b8984f3 gif2webp: fix compile with giflib 5.1.0
 222f9b1 gif2webp: simplify giflib version checking
-d2cc61b Extend MakeARGB32() to accept Alpha channel. (master)
+d2cc61b Extend MakeARGB32() to accept Alpha channel.
 4595b62 Merge "use explicit size of kErrorMessages[] arrays"
 157de01 Merge "Actuate memory stats for PRINT_MEMORY_INFO"
 fbda2f4 JPEG decoder: delay conversion to YUV to WebPEncode() call
--- a/Makefile.vc
+++ b/Makefile.vc
@ -27,7 +27,7 @@ PLATFORM_LDFLAGS = /SAFESEH
 NOLOGO     = /nologo
 CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
 CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
-CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c /GS
+CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
 CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
@ -54,6 +54,11 @@ AVX2_FLAGS = /arch:AVX2
 !IF "$(RTLIBCFG)" == "static"
 RTLIB  = /MT
 RTLIBD = /MTd
+!ELSE IF "$(RTLIBCFG)" == "legacy"
+RTLIBCFG = static
+RTLIB  = /MT
+RTLIBD = /MTd
+CFLAGS = $(CFLAGS) /GS- /arch:IA32
 !ELSE
 RTLIB   = /MD
 RTLIBD  = /MDd
@ -139,6 +144,7 @@ CFGSET = TRUE
 !MESSAGE - all                            - build (de)mux-based targets for CFG
 !MESSAGE
 !MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
+!MESSAGE   'legacy' will produce a Windows 2000 compatible library.
 !MESSAGE OBJDIR is the path where you like to build (obj, bins, etc.),
 !MESSAGE   defaults to ..\obj

--- a/14
+++ b/14
@ -1,3 +1,17 @@
+- 10/15/15: version 0.4.4
+  This is a binary compatible release.
+  * rescaling out-of-bounds read fix (issue #254)
+  * various build fixes and improvements (issues #253, #259, #262, #267, #268)
+  * container documentation update
+  * gif2webp transparency fix (issue #245)
+
+- 3/3/15: version 0.4.3
+  This is a binary compatible release.
+  * Android / gcc / iOS / MSVS build fixes and improvements
+  * lossless decode fix (issue #239 -- since 0.4.0)
+  * documentation / vwebp updates for animation
+  * multi-threading fix (issue #234)
+
 - 10/13/14: version 0.4.2
  This is a binary compatible release.
  * Android / gcc build fixes
--- a/2
+++ b/2
@ -17,7 +17,7 @@ or agree to the institution of patent litigation or any other patent
 enforcement activity against any entity (including a cross-claim or
 counterclaim in a lawsuit) alleging that any of these implementations of WebM
 or any code incorporated within any of these implementations of WebM
-constitutes direct or contributory patent infringement, or inducement of
+constitute direct or contributory patent infringement, or inducement of
 patent infringement, then any patent rights granted to you under this License
 for these implementations of WebM shall terminate as of the date such
 litigation is filed.
--- a/4
+++ b/4
@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.4.2
+               \____/____/\_____/_____/____/v0.4.4

 Description:
 ============
@ -596,7 +596,7 @@ Bugs:
 =====

 Please report all bugs to our issue tracker:
-    http://code.google.com/p/webp/issues
+    https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
    http://www.webmproject.org/code/contribute/submitting-patches/

--- a/README.mux
+++ b/README.mux
@ -175,7 +175,7 @@ Bugs:
 =====

 Please report all bugs to our issue tracker:
-    http://code.google.com/p/webp/issues
+    https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
    http://www.webmproject.org/code/contribute/submitting-patches/

--- a/configure.ac
+++ b/configure.ac
@ -1,5 +1,5 @@
-AC_INIT([libwebp], [0.4.2],
-        [http://code.google.com/p/webp/issues],,
+AC_INIT([libwebp], [0.4.4],
+        [https://bugs.chromium.org/p/webp],,
        [http://developers.google.com/speed/webp])
 AC_CANONICAL_HOST
 AC_PREREQ([2.60])
@ -54,6 +54,7 @@ AC_DEFUN([TEST_AND_ADD_CFLAGS],
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat-nonliteral])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat-security])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-prototypes])
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -46,25 +46,16 @@ for:
  * **Animation.** An image may have multiple frames with pauses between them,
    making it an animation.

-  * **Image Fragmentation.** A single bitstream in WebP has an inherent
-    limitation for width or height of 2^14 pixels, and, when using VP8, a 512
-    KiB limit on the size of the first compressed partition. To support larger
-    images, the format supports images that are composed of multiple fragments,
-    each encoded as a separate bitstream. All fragments logically form a single
-    image: they have common metadata, color profile, etc. Image fragmentation
-    may also improve efficiency for larger images, e.g., grass can be encoded
-    differently than sky.
-
 The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
 "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
 document are to be interpreted as described in [RFC 2119][].

+Bit numbering in chunk diagrams starts at `0` for the most significant bit
+('MSB 0') as described in [RFC 1166][].
+
 **Note:** Out of the features mentioned above, lossy compression, lossless
 compression, transparency, metadata, color profile and animation are finalized
-and are to be considered stable. On the other hand, image fragmentation is
-experimental as of now, and is open to discussion, feedback and comments.
-The same is indicated using annotation "_status: experimental_" in the relevant
-sections of this document.
+and are to be considered stable.

 Terminology & Basics
 ------------------------
@ -79,7 +70,7 @@ Below are additional terms used throughout this document:
 _Reader/Writer_

 : Code that reads WebP files is referred to as a _reader_, while code that
-writes them is referred to as a _writer_.
+  writes them is referred to as a _writer_.

 _uint16_

@ -101,10 +92,12 @@ _FourCC_
 _1-based_

 : An unsigned integer field storing values offset by `-1`, e.g., such a field
-would store value _25_ as _24_.
+  would store value _25_ as _24_.

-RIFF file format
+
+RIFF File Format
 ----------------
+
 The WebP file format is based on the RIFF (resource interchange file format)
 document format.

@ -144,7 +137,8 @@ _ChunkHeader('ABCD')_
 chunks that apply to any RIFF file format, while FourCCs specific to a file
 format are all lowercase. WebP does not follow this convention.

-WebP file header
+
+WebP File Header
 ----------------

     0                   1                   2                   3
@ -164,8 +158,8 @@ WebP file header
 File Size: 32 bits (_uint32_)

 : The size of the file in bytes starting at offset 8. The maximum value of
-this field is 2^32 minus 10 bytes and thus the size of the whole file is at
-most 4GiB minus 2 bytes.
+  this field is 2^32 minus 10 bytes and thus the size of the whole file is at
+  most 4GiB minus 2 bytes.

 'WEBP': 32 bits

@ -177,7 +171,8 @@ the 'WEBP' FourCC. The file SHOULD NOT contain anything after it. As the size
 of any chunk is even, the size given by the RIFF header is also even. The
 contents of individual chunks will be described in the following sections.

-Simple file format (lossy)
+
+Simple File Format (Lossy)
 --------------------------

 This layout SHOULD be used if the image requires _lossy_ encoding and does not
@ -215,7 +210,8 @@ width and height. That is assumed to be the width and height of the canvas.
 The VP8 specification describes how to decode the image into Y'CbCr
 format. To convert to RGB, Rec. 601 SHOULD be used.

-Simple file format (lossless)
+
+Simple File Format (Lossless)
 -----------------------------

 **Note:** Older readers may not support files using the lossless format.
@ -253,7 +249,8 @@ The current specification of the VP8L bitstream can be found at
 contains the VP8L image width and height. That is assumed to be the width
 and height of the canvas.

-Extended file format
+
+Extended File Format
 --------------------

 **Note:** Older readers may not support files using the extended format.
@ -274,13 +271,15 @@ An extended format file consists of:

  * An optional list of [unknown chunks](#unknown-chunks). _\[status: experimental\]_

-For a _still image_, the _image data_ consists of a single frame, whereas for
-an _animated image_, it consists of multiple frames. More details about frames
-can be found in the [Animation](#animation) section.
+For a _still image_, the _image data_ consists of a single frame, which is made
+up of:

-Moreover, each frame can be fragmented or non-fragmented, as will be described
-in the [Extended WebP file header](#extended_header) section. More details about
-fragments can be found in the [Fragments](#fragments) section.
+  * An optional [alpha subchunk](#alpha).
+
+  * A [bitstream subchunk](#bitstream-vp8vp8l).
+
+For an _animated image_, the _image data_ consists of multiple frames. More
+details about frames can be found in the [Animation](#animation) section.

 All chunks SHOULD be placed in the same order as listed above. If a chunk
 appears in the wrong place, the file is invalid, but readers MAY parse the
@ -302,7 +301,7 @@ Extended WebP file header:
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |                      ChunkHeader('VP8X')                      |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |Rsv|I|L|E|X|A|F|                   Reserved                    |
+    |Rsv|I|L|E|X|A|R|                   Reserved                    |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |          Canvas Width Minus One               |             ...
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@ -320,7 +319,7 @@ ICC profile (I): 1 bit
 Alpha (L): 1 bit

 : Set if any of the frames of the image contain transparency information
-("alpha").
+  ("alpha").

 EXIF metadata (E): 1 bit

@ -333,11 +332,11 @@ XMP metadata (X): 1 bit
 Animation (A): 1 bit

 : Set if this is an animated image. Data in 'ANIM' and 'ANMF' chunks should be
-used to control the animation.
+  used to control the animation.

-Image Fragmentation (F): 1 bit _\[status: experimental\]_
+Reserved (R): 1 bit

-: Set if any of the frames in the image are represented by fragments.
+: SHOULD be `0`.

 Reserved: 24 bits

@ -382,9 +381,9 @@ animation.
 Background Color: 32 bits (_uint32_)

 : The default background color of the canvas in \[Blue, Green, Red, Alpha\]
-byte order. This color MAY be used to fill the unused space on the canvas around
-the frames, as well as the transparent pixels of the first frame. Background
-color is also used when disposal method is `1`.
+  byte order. This color MAY be used to fill the unused space on the canvas
+  around the frames, as well as the transparent pixels of the first frame.
+  Background color is also used when disposal method is `1`.

 **Note**:

@ -394,6 +393,9 @@ color is also used when disposal method is `1`.
  * Viewer applications SHOULD treat the background color value as a hint, and
    are not required to use it.

+  * The canvas is cleared at the start of each loop. The background color MAY be
+    used to achieve this.
+
 Loop Count: 16 bits (_uint16_)

 : The number of times to loop the animation. `0` means infinitely.
@ -402,7 +404,6 @@ This chunk MUST appear if the _Animation_ flag in the VP8X chunk is set.
 If the _Animation_ flag is not set and this chunk is present, it
 SHOULD be ignored.

-
 ANMF chunk:

 For animated images, this chunk contains information about a _single_ frame.
@ -445,8 +446,8 @@ Frame Height Minus One: 24 bits (_uint24_)
 Frame Duration: 24 bits (_uint24_)

 : The time to wait before displaying the next frame, in 1 millisecond units.
-In particular, frame duration of 0 is useful when one wants to update multiple
-areas of the canvas at once during the animation.
+  In particular, frame duration of 0 is useful when one wants to update
+  multiple areas of the canvas at once during the animation.

 Reserved: 6 bits

@ -454,28 +455,28 @@ Reserved: 6 bits

 Blending method (B): 1 bit

-: Indicates how transparent pixels of _the current frame_ are to be blended with
-corresponding pixels of the previous canvas:
+: Indicates how transparent pixels of _the current frame_ are to be blended
+  with corresponding pixels of the previous canvas:

-  * `0`: Use alpha blending. After disposing of the previous frame, render the
-    current frame on the canvas using [alpha-blending](#alpha-blending). If the
-    current frame does not have an alpha channel, assume alpha value of 255,
-    effectively replacing the rectangle.
+    * `0`: Use alpha blending. After disposing of the previous frame, render the
+      current frame on the canvas using [alpha-blending](#alpha-blending). If
+      the current frame does not have an alpha channel, assume alpha value of
+      255, effectively replacing the rectangle.

-  * `1`: Do not blend. After disposing of the previous frame, render the
-    current frame on the canvas by overwriting the rectangle covered by the
-    current frame.
+    * `1`: Do not blend. After disposing of the previous frame, render the
+      current frame on the canvas by overwriting the rectangle covered by the
+      current frame.

 Disposal method (D): 1 bit

-: Indicates how _the current frame_ is to be treated after it has been displayed
-(before rendering the next frame) on the canvas:
+: Indicates how _the current frame_ is to be treated after it has been
+  displayed (before rendering the next frame) on the canvas:

-  * `0`: Do not dispose. Leave the canvas as is.
+    * `0`: Do not dispose. Leave the canvas as is.

-  * `1`: Dispose to background color. Fill the _rectangle_ on the canvas covered
-    by the _current frame_ with background color specified in the
-    [ANIM chunk](#anim_chunk).
+    * `1`: Dispose to background color. Fill the _rectangle_ on the canvas
+      covered by the _current frame_ with background color specified in the
+      [ANIM chunk](#anim_chunk).

 **Notes**:

@ -506,9 +507,7 @@ Disposal method (D): 1 bit

 Frame Data: _Chunk Size_ - `16` bytes

-: For a fragmented frame, it consists of multiple [fragment chunks](#fragments).
-
-: For a non-fragmented frame, it consists of:
+: Consists of:

  * An optional [alpha subchunk](#alpha) for the frame.

@ -519,49 +518,6 @@ Frame Data: _Chunk Size_ - `16` bytes
 **Note**: The 'ANMF' payload, _Frame Data_ above, consists of individual
 _padded_ chunks as described by the [RIFF file format](#riff-file-format).

-#### Fragments _\[status: experimental\]_
-
-For images that are represented by fragments, this chunk contains data for
-a single fragment. If the _Image Fragmentation Flag_ is not set, then this chunk
-SHOULD NOT be present.
-
-     0                   1                   2                   3
-     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |                      ChunkHeader('FRGM')                      |
-    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |                  Fragment X                   |             ...
-    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    ...       Fragment Y            |         Fragment Data         |
-    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-Fragment X: 24 bits (_uint24_)
-
-: The X coordinate of the upper left corner of the fragment is `Fragment X * 2`
-
-Fragment Y: 24 bits (_uint24_)
-
-: The Y coordinate of the upper left corner of the fragment is `Fragment Y * 2`
-
-Fragment Data: _Chunk Size_ - `6` bytes
-
-: It contains:
-
-  * An optional [alpha subchunk](#alpha) for the fragment.
-  * The [bitstream subchunk](#bitstream-vp8vp8l) for the fragment.
-  * An optional list of [unknown chunks](#unknown-chunks).
-
-Note: The width and height of the fragment is obtained from the bitstream
-subchunk.
-
-The fragments of a frame SHOULD have the following properties:
-
-  * They collectively cover the whole frame.
-
-  * No pair of fragments have any overlapping region on the frame.
-
-  * No portion of any fragment should be located outside of the canvas.
-
 #### Alpha

     0                   1                   2                   3
@ -579,20 +535,20 @@ Reserved (Rsv): 2 bits
 Pre-processing (P): 2 bits

 : These INFORMATIVE bits are used to signal the pre-processing that has
-been performed during compression. The decoder can use this information to
-e.g. dither the values or smooth the gradients prior to display.
+  been performed during compression. The decoder can use this information to
+  e.g. dither the values or smooth the gradients prior to display.

-  * `0`: no pre-processing
-  * `1`: level reduction
+    * `0`: no pre-processing
+    * `1`: level reduction

 Filtering method (F): 2 bits

 : The filtering method used:

-  * `0`: None.
-  * `1`: Horizontal filter.
-  * `2`: Vertical filter.
-  * `3`: Gradient filter.
+    * `0`: None.
+    * `1`: Horizontal filter.
+    * `2`: Vertical filter.
+    * `3`: Gradient filter.

 For each pixel, filtering is performed using the following calculations.
 Assume the alpha values surrounding the current `X` position are labeled as:
@ -636,15 +592,15 @@ Compression method (C): 2 bits

 : The compression method used:

-  * `0`: No compression.
-  * `1`: Compressed using the WebP lossless format.
+    * `0`: No compression.
+    * `1`: Compressed using the WebP lossless format.

 Alpha bitstream: _Chunk Size_ - `1` bytes

 : Encoded alpha bitstream.

-This optional chunk contains encoded alpha data for this frame/fragment. A
-frame/fragment containing a 'VP8L' chunk SHOULD NOT contain this chunk.
+This optional chunk contains encoded alpha data for this frame. A frame
+containing a 'VP8L' chunk SHOULD NOT contain this chunk.

 **Rationale**: The transparency information is already part of the 'VP8L'
 chunk.
@ -675,15 +631,15 @@ compression method is '0') or compressed using the lossless format

 #### Bitstream (VP8/VP8L)

-This chunk contains compressed bitstream data for a single frame/fragment.
+This chunk contains compressed bitstream data for a single frame.

 A bitstream chunk may be either (i) a VP8 chunk, using "VP8 " (note the
 significant fourth-character space) as its tag _or_ (ii) a VP8L chunk, using
 "VP8L" as its tag.

 The formats of VP8 and VP8L chunks are as described in sections
-[Simple file format (lossy)](#simple-file-format-lossy)
-and [Simple file format (lossless)](#simple-file-format-lossless) respectively.
+[Simple File Format (Lossy)](#simple-file-format-lossy)
+and [Simple File Format (Lossless)](#simple-file-format-lossless) respectively.

 #### Color profile

@ -731,7 +687,6 @@ EXIF Metadata: _Chunk Size_ bytes

 : image metadata in EXIF format.

-
 XMP chunk:

     0                   1                   2                   3
@ -762,47 +717,17 @@ A file MAY contain unknown chunks:

  * At the end of the file as described in the [Extended WebP file
    header](#extended_header) section.
-  * At the end of FRGM and ANMF chunks as described in [Fragments](#fragments)
-    and [Animation](#animation) sections.
+  * At the end of ANMF chunks as described in the
+    [Animation](#animation) section.

 Readers SHOULD ignore these chunks. Writers SHOULD preserve them in their
 original order (unless they specifically intend to modify these chunks).

-### Assembling the Canvas from fragments/frames
+### Assembling the Canvas from frames

-Here we provide an overview of how a reader should assemble a canvas in case
-of a fragmented-image and in case of an animated image. The notation
-_VP8X.field_ means the field in the 'VP8X' chunk with the same description.
-
-Displaying a _fragmented image_ canvas MUST be equivalent to the following
-pseudocode: _\[status: experimental\]_
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-assert VP8X.flags.hasFragments
-canvas ← new black image of size VP8X.canvasWidth x VP8X.canvasHeight.
-frgm_params ← nil
-for chunk in image_data:
-    assert chunk.tag is "FRGM"
-    frgm_params.fragmentX = Fragment X
-    frgm_params.fragmentY = Fragment Y
-    for subchunk in 'Fragment Data':
-        if subchunk.tag == "ALPH":
-            assert alpha subchunks not found in 'Fragment Data' earlier
-            frgm_params.alpha = alpha_data
-        else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
-            assert bitstream subchunks not found in 'Fragment Data' earlier
-            frgm_params.bitstream = bitstream_data
-    frgm_params.fragmentWidth = Width extracted from bitstream subchunk
-    frgm_params.fragmentHeight = Height extracted from bitstream subchunk
-    assert VP8X.canvasWidth >=
-        frgm_params.fragmentX + frgm_params.fragmentWidth
-    assert VP8X.canvasHeight >=
-        frgm_params.fragmentY + frgm_params.fragmentHeight
-    assert fragment has the properties mentioned in "Image Fragments" section.
-    render fragment with frame_params.alpha and frame_params.bitstream on canvas
-    with top-left corner in (frgm_params.fragmentX, frgm_params.fragmentY).
-canvas contains the decoded canvas.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Here we provide an overview of how a reader should assemble a canvas in the
+case of an animated image. The notation _VP8X.field_ means the field in the
+'VP8X' chunk with the same description.

 Displaying an _animated image_ canvas MUST be equivalent to the following
 pseudocode:
@ -810,28 +735,25 @@ pseudocode:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 assert VP8X.flags.hasAnimation
 canvas ← new image of size VP8X.canvasWidth x VP8X.canvasHeight with
-background color ANIM.background_color.
+         background color ANIM.background_color.
 loop_count ← ANIM.loopCount
 dispose_method ← ANIM.disposeMethod
 if loop_count == 0:
    loop_count = ∞
 frame_params ← nil
-for loop = 0, ..., loop_count - 1
-    assert next chunk in image_data is ANMF
-    frame_params.frameX = Frame X
-    frame_params.frameY = Frame Y
-    frame_params.frameWidth = Frame Width Minus One + 1
-    frame_params.frameHeight = Frame Height Minus One + 1
-    frame_params.frameDuration = Frame Duration
-    assert VP8X.canvasWidth >= frame_params.frameX + frame_params.frameWidth
-    assert VP8X.canvasHeight >= frame_params.frameY + frame_params.frameHeight
-    if VP8X.flags.hasFragments and first subchunk in 'Frame Data' is FRGM
-        // Fragmented frame.
-        frame_params.{bitstream,alpha} = canvas decoded from subchunks in
-                                         'Frame Data' as per the pseudocode for
-                                         _fragmented image_ above.
-    else
-        // Non-fragmented frame.
+assert next chunk in image_data is ANMF
+for loop = 0..loop_count - 1
+    clear canvas to ANIM.background_color or application defined color
+    until eof or non-ANMF chunk
+        frame_params.frameX = Frame X
+        frame_params.frameY = Frame Y
+        frame_params.frameWidth = Frame Width Minus One + 1
+        frame_params.frameHeight = Frame Height Minus One + 1
+        frame_params.frameDuration = Frame Duration
+        frame_right = frame_params.frameX + frame_params.frameWidth
+        frame_bottom = frame_params.frameY + frame_params.frameHeight
+        assert VP8X.canvasWidth >= frame_right
+        assert VP8X.canvasHeight >= frame_bottom
        for subchunk in 'Frame Data':
            if subchunk.tag == "ALPH":
                assert alpha subchunks not found in 'Frame Data' earlier
@ -839,14 +761,15 @@ for loop = 0, ..., loop_count - 1
            else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
                assert bitstream subchunks not found in 'Frame Data' earlier
                frame_params.bitstream = bitstream_data
-    render frame with frame_params.alpha and frame_params.bitstream on canvas
-    with top-left corner in (frame_params.frameX, frame_params.frameY), using
-    dispose method dispose_method.
-    Show the contents of the image for frame_params.frameDuration * 1ms.
-canvas contains the decoded canvas.
+        render frame with frame_params.alpha and frame_params.bitstream on
+            canvas with top-left corner at (frame_params.frameX,
+            frame_params.frameY), using dispose method dispose_method.
+        canvas contains the decoded image.
+        Show the contents of the canvas for frame_params.frameDuration * 1ms.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Example file layouts
+
+Example File Layouts
 --------------------

 A lossy encoded image with alpha may look as follows:
@ -878,17 +801,6 @@ RIFF/WEBP
 +- XMP  (metadata)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-A fragmented image may look as follows:
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-RIFF/WEBP
-+- VP8X (descriptions of features used)
-+- FRGM (fragment1 parameters + data)
-+- FRGM (fragment2 parameters + data)
-+- FRGM (fragment3 parameters + data)
-+- FRGM (fragment4 parameters + data)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 An animated image with EXIF metadata may look as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -903,7 +815,8 @@ RIFF/WEBP
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 [vp8spec]:  http://tools.ietf.org/html/rfc6386
-[webpllspec]: https://gerrit.chromium.org/gerrit/gitweb?p=webm/libwebp.git;a=blob;f=doc/webp-lossless-bitstream-spec.txt;hb=master
+[webpllspec]: https://chromium.googlesource.com/webm/libwebp/+/master/doc/webp-lossless-bitstream-spec.txt
 [iccspec]: http://www.color.org/icc_specs2.xalter
 [metadata]: http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
+[rfc 1166]: http://tools.ietf.org/html/rfc1166
 [rfc 2119]: http://tools.ietf.org/html/rfc2119
--- a/examples/Android.mk
+++ b/examples/Android.mk
@ -1,5 +1,8 @@
 LOCAL_PATH := $(call my-dir)

+################################################################################
+# libexample_util
+
 include $(CLEAR_VARS)

 LOCAL_SRC_FILES := \
@ -12,6 +15,9 @@ LOCAL_MODULE := example_util

 include $(BUILD_STATIC_LIBRARY)

+################################################################################
+# cwebp
+
 include $(CLEAR_VARS)

 # Note: to enable jpeg/png encoding the sources from AOSP can be used with
@ -32,6 +38,9 @@ LOCAL_MODULE := cwebp

 include $(BUILD_EXECUTABLE)

+################################################################################
+# dwebp
+
 include $(CLEAR_VARS)

 LOCAL_SRC_FILES := \
@ -44,3 +53,19 @@ LOCAL_STATIC_LIBRARIES := example_util webp
 LOCAL_MODULE := dwebp

 include $(BUILD_EXECUTABLE)
+
+################################################################################
+# webpmux
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    webpmux.c \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
+LOCAL_STATIC_LIBRARIES := example_util webpmux webp
+
+LOCAL_MODULE := webpmux_example
+
+include $(BUILD_EXECUTABLE)
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -22,6 +22,7 @@

 #ifdef WEBP_HAVE_PNG
 #include <png.h>
+#include <setjmp.h>   // note: this must be included *after* png.h
 #endif

 #ifdef HAVE_WINCODEC_H
@ -192,8 +193,8 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const int has_alpha = (buffer->colorspace == MODE_RGBA);
-  png_structp png;
-  png_infop info;
+  volatile png_structp png;
+  volatile png_infop info;
  png_uint_32 y;

  png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
@ -203,11 +204,11 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  }
  info = png_create_info_struct(png);
  if (info == NULL) {
-    png_destroy_write_struct(&png, NULL);
+    png_destroy_write_struct((png_structpp)&png, NULL);
    return 0;
  }
  if (setjmp(png_jmpbuf(png))) {
-    png_destroy_write_struct(&png, &info);
+    png_destroy_write_struct((png_structpp)&png, (png_infopp)&info);
    return 0;
  }
  png_init_io(png, out_file);
@ -221,7 +222,7 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
    png_write_rows(png, &row, 1);
  }
  png_write_end(png, info);
-  png_destroy_write_struct(&png, &info);
+  png_destroy_write_struct((png_structpp)&png, (png_infopp)&info);
  return 1;
 }
 #else    // !HAVE_WINCODEC_H && !WEBP_HAVE_PNG
@ -244,10 +245,10 @@ static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
  uint32_t y;

  if (alpha) {
-    fprintf(fout, "P7\nWIDTH %d\nHEIGHT %d\nDEPTH 4\nMAXVAL 255\n"
+    fprintf(fout, "P7\nWIDTH %u\nHEIGHT %u\nDEPTH 4\nMAXVAL 255\n"
                  "TUPLTYPE RGB_ALPHA\nENDHDR\n", width, height);
  } else {
-    fprintf(fout, "P6\n%d %d\n255\n", width, height);
+    fprintf(fout, "P6\n%u %u\n255\n", width, height);
  }
  for (y = 0; y < height; ++y) {
    if (fwrite(rgb + y * stride, width, bytes_per_px, fout) != bytes_per_px) {
@ -404,7 +405,7 @@ static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
  const int a_stride = buffer->u.YUVA.a_stride;
  uint32_t y;
  assert(a != NULL);
-  fprintf(fout, "P5\n%d %d\n255\n", width, height);
+  fprintf(fout, "P5\n%u %u\n255\n", width, height);
  for (y = 0; y < height; ++y) {
    if (fwrite(a + y * a_stride, width, 1, fout) != 1) {
      return 0;
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -46,18 +46,7 @@

 //------------------------------------------------------------------------------

-static int transparent_index;  // Index of transparent color in the map.
-
-static void ResetFrameInfo(WebPMuxFrameInfo* const info) {
-  WebPDataInit(&info->bitstream);
-  info->x_offset = 0;
-  info->y_offset = 0;
-  info->duration = 0;
-  info->id = WEBP_CHUNK_ANMF;
-  info->dispose_method = WEBP_MUX_DISPOSE_NONE;
-  info->blend_method = WEBP_MUX_BLEND;
-  transparent_index = -1;  // Opaque frame by default.
-}
+static int transparent_index = -1;  // Opaque frame by default.

 static void SanitizeKeyFrameIntervals(size_t* const kmin_ptr,
                                      size_t* const kmax_ptr) {
@ -270,7 +259,8 @@ int main(int argc, const char *argv[]) {
  GifFileType* gif = NULL;
  WebPConfig config;
  WebPPicture frame;
-  WebPMuxFrameInfo info;
+  int duration = 0;
+  FrameDisposeMethod orig_dispose = FRAME_DISPOSE_NONE;
  WebPMuxAnimParams anim = { WHITE_COLOR, 0 };
  WebPFrameCache* cache = NULL;

@ -290,14 +280,11 @@ int main(int argc, const char *argv[]) {
  size_t kmax = 0;
  int allow_mixed = 0;   // If true, each frame can be lossy or lossless.

-  ResetFrameInfo(&info);
-
  if (!WebPConfigInit(&config) || !WebPPictureInit(&frame)) {
    fprintf(stderr, "Error! Version mismatch!\n");
    return -1;
  }
  config.lossless = 1;  // Use lossless compression by default.
-  config.image_hint = WEBP_HINT_GRAPH;   // always low-color

  if (argc == 1) {
    Help();
@ -499,7 +486,8 @@ int main(int argc, const char *argv[]) {
          goto End;
        }

-        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, &frame, &info)) {
+        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, orig_dispose,
+                                    duration, &frame)) {
          fprintf(stderr, "Error! Cannot encode frame as WebP\n");
          fprintf(stderr, "Error code: %d\n", frame.error_code);
        }
@ -515,7 +503,9 @@ int main(int argc, const char *argv[]) {
        // In GIF, graphic control extensions are optional for a frame, so we
        // may not get one before reading the next frame. To handle this case,
        // we reset frame properties to reasonable defaults for the next frame.
-        ResetFrameInfo(&info);
+        orig_dispose = FRAME_DISPOSE_NONE;
+        duration = 0;
+        transparent_index = -1;  // Opaque frame by default.
        break;
      }
      case EXTENSION_RECORD_TYPE: {
@ -533,20 +523,19 @@ int main(int argc, const char *argv[]) {
            const int dispose = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
            const int delay = data[2] | (data[3] << 8);  // In 10 ms units.
            if (data[0] != 4) goto End;
-            info.duration = delay * 10;  // Duration is in 1 ms units for WebP.
-            if (dispose == 3) {
-              static int warning_printed = 0;
-              if (!warning_printed) {
-                fprintf(stderr, "WARNING: GIF_DISPOSE_RESTORE unsupported.\n");
-                warning_printed = 1;
-              }
-              // failsafe. TODO(urvang): emulate the correct behaviour by
-              // recoding the whole frame.
-              info.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
-            } else {
-              info.dispose_method =
-                  (dispose == 2) ? WEBP_MUX_DISPOSE_BACKGROUND
-                                 : WEBP_MUX_DISPOSE_NONE;
+            duration = delay * 10;  // Duration is in 1 ms units for WebP.
+            switch (dispose) {
+              case 3:
+                orig_dispose = FRAME_DISPOSE_RESTORE_PREVIOUS;
+                break;
+              case 2:
+                orig_dispose = FRAME_DISPOSE_BACKGROUND;
+                break;
+              case 1:
+              case 0:
+              default:
+                orig_dispose = FRAME_DISPOSE_NONE;
+                break;
            }
            transparent_index = (flags & GIF_TRANSPARENT_MASK) ? data[4] : -1;
            break;
--- a/examples/gif2webp_util.c
+++ b/examples/gif2webp_util.c
--- a/examples/gif2webp_util.h
+++ b/examples/gif2webp_util.h
@ -29,6 +29,13 @@ extern "C" {

 struct WebPPicture;

+// Includes all disposal methods, even the ones not supported by WebP bitstream.
+typedef enum FrameDisposeMethod {
+  FRAME_DISPOSE_NONE,
+  FRAME_DISPOSE_BACKGROUND,
+  FRAME_DISPOSE_RESTORE_PREVIOUS
+} FrameDisposeMethod;
+
 typedef struct {
  int x_offset, y_offset, width, height;
 } WebPFrameRect;
@ -53,15 +60,15 @@ WebPFrameCache* WebPFrameCacheNew(int width, int height,
 // Release all the frame data from 'cache' and free 'cache'.
 void WebPFrameCacheDelete(WebPFrameCache* const cache);

-// Given an image described by 'frame', 'info' and 'orig_rect', optimize it for
-// WebP, encode it and add it to 'cache'. 'orig_rect' can be NULL.
-// This takes care of frame disposal too, according to 'info->dispose_method'.
+// Given an image described by 'frame', 'rect', 'dispose_method' and 'duration',
+// optimize it for WebP, encode it and add it to 'cache'. 'rect' can be NULL.
+// This takes care of frame disposal too, according to 'dispose_method'.
 // Returns false in case of error (and sets frame->error_code accordingly).
 int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
                           const WebPConfig* const config,
-                           const WebPFrameRect* const orig_rect,
-                           WebPPicture* const frame,
-                           WebPMuxFrameInfo* const info);
+                           const WebPFrameRect* const rect,
+                           FrameDisposeMethod dispose_method, int duration,
+                           WebPPicture* const frame);

 // Flush the *ready* frames from cache and add them to 'mux'. If 'verbose' is
 // true, prints the information about these frames.
--- a/examples/stopwatch.h
+++ b/examples/stopwatch.h
@ -14,6 +14,8 @@
 #ifndef WEBP_EXAMPLES_STOPWATCH_H_
 #define WEBP_EXAMPLES_STOPWATCH_H_

+#include "webp/types.h"
+
 #if defined _WIN32 && !defined __GNUC__
 #include <windows.h>

@ -37,6 +39,7 @@ static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {


 #else    /* !_WIN32 */
+#include <string.h>  // memcpy
 #include <sys/time.h>

 typedef struct timeval Stopwatch;
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -42,8 +42,6 @@
 #define snprintf _snprintf
 #endif

-static void Help(void);
-
 // Unfortunate global variables. Gathered into a struct for comfort.
 static struct {
  int has_animation;
@ -82,6 +80,16 @@ static void ClearParams(void) {
  kParams.dmux = NULL;
 }

+// Sets the previous frame to the dimensions of the canvas and has it dispose
+// to background to cause the canvas to be cleared.
+static void ClearPreviousFrame(void) {
+  WebPIterator* const prev = &kParams.prev_frame;
+  prev->width = kParams.canvas_width;
+  prev->height = kParams.canvas_height;
+  prev->x_offset = prev->y_offset = 0;
+  prev->dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+}
+
 // -----------------------------------------------------------------------------
 // Color profile handling
 static int ApplyColorProfile(const WebPData* const profile,
@ -181,6 +189,8 @@ static void decode_callback(int what) {
        if (WebPDemuxGetFrame(kParams.dmux, 1, curr)) {
          --kParams.loop_count;
          kParams.done = (kParams.loop_count == 0);
+          if (kParams.done) return;
+          ClearPreviousFrame();
        } else {
          kParams.decoding_error = 1;
          kParams.done = 1;
@ -298,19 +308,24 @@ static void HandleDisplay(void) {
    //              they will be incorrect if the window is resized.
    // glScissor() takes window coordinates (0,0 at bottom left).
    int window_x, window_y;
+    int frame_w, frame_h;
    if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
      // Clear the previous frame rectangle.
      window_x = prev->x_offset;
      window_y = kParams.canvas_height - prev->y_offset - prev->height;
+      frame_w = prev->width;
+      frame_h = prev->height;
    } else {  // curr->blend_method == WEBP_MUX_NO_BLEND.
      // We simulate no-blending behavior by first clearing the current frame
      // rectangle (to a checker-board) and then alpha-blending against it.
      window_x = curr->x_offset;
      window_y = kParams.canvas_height - curr->y_offset - curr->height;
+      frame_w = curr->width;
+      frame_h = curr->height;
    }
    glEnable(GL_SCISSOR_TEST);
    // Only update the requested area, not the whole canvas.
-    glScissor(window_x, window_y, prev->width, prev->height);
+    glScissor(window_x, window_y, frame_w, frame_h);

    glClear(GL_COLOR_BUFFER_BIT);  // use clear color
    DrawCheckerBoard();
@ -395,7 +410,6 @@ int main(int argc, char *argv[]) {
  int c;
  WebPDecoderConfig* const config = &kParams.config;
  WebPIterator* const curr = &kParams.curr_frame;
-  WebPIterator* const prev = &kParams.prev_frame;

  if (!WebPInitDecoderConfig(config)) {
    fprintf(stderr, "Library version mismatch!\n");
@ -486,10 +500,7 @@ int main(int argc, char *argv[]) {
    printf("Canvas: %d x %d\n", kParams.canvas_width, kParams.canvas_height);
  }

-  prev->width = kParams.canvas_width;
-  prev->height = kParams.canvas_height;
-  prev->x_offset = prev->y_offset = 0;
-  prev->dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+  ClearPreviousFrame();

  memset(&kParams.iccp, 0, sizeof(kParams.iccp));
  kParams.has_color_profile =
--- a/examples/wicdec.c
+++ b/examples/wicdec.c
@ -15,6 +15,7 @@
 #include "webp/config.h"
 #endif

+#include <assert.h>
 #include <stdio.h>

 #ifdef HAVE_WINCODEC_H
@ -72,6 +73,12 @@ WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppBGRA_,
 WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppRGBA_,
                 0xf5c7ad2d, 0x6a8d, 0x43dd,
                 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat64bppBGRA_,
+                 0x1562ff7c, 0xd352, 0x46f9,
+                 0x97, 0x9e, 0x42, 0x97, 0x6b, 0x79, 0x22, 0x46);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat64bppRGBA_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x16);

 static HRESULT OpenInputStream(const char* filename, IStream** stream) {
  HRESULT hr = S_OK;
@ -109,6 +116,7 @@ static HRESULT ExtractICCP(IWICImagingFactory* const factory,
    IFS(IWICBitmapFrameDecode_GetColorContexts(frame,
                                               count, color_contexts,
                                               &num_color_contexts));
+    assert(FAILED(hr) || num_color_contexts <= count);
    for (i = 0; SUCCEEDED(hr) && i < num_color_contexts; ++i) {
      WICColorContextType type;
      IFS(IWICColorContext_GetType(color_contexts[i], &type));
@ -116,7 +124,7 @@ static HRESULT ExtractICCP(IWICImagingFactory* const factory,
        UINT size;
        IFS(IWICColorContext_GetProfileBytes(color_contexts[i],
                                             0, NULL, &size));
-        if (size > 0) {
+        if (SUCCEEDED(hr) && size > 0) {
          iccp->bytes = (uint8_t*)malloc(size);
          if (iccp->bytes == NULL) {
            hr = E_OUTOFMEMORY;
@ -194,7 +202,11 @@ static int HasAlpha(IWICImagingFactory* const factory,
    has_alpha = IsEqualGUID(MAKE_REFGUID(pixel_format),
                            MAKE_REFGUID(GUID_WICPixelFormat32bppRGBA_)) ||
                IsEqualGUID(MAKE_REFGUID(pixel_format),
-                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_));
+                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat64bppRGBA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat64bppBGRA_));
  }
  return has_alpha;
 }
@ -261,7 +273,7 @@ int ReadPictureWithWIC(const char* const filename,
  IFS(IWICBitmapFrameDecode_GetPixelFormat(frame, &src_pixel_format));
  IFS(IWICBitmapDecoder_GetContainerFormat(decoder, &src_container_format));

-  if (keep_alpha) {
+  if (SUCCEEDED(hr) && keep_alpha) {
    const GUID** guid;
    for (guid = kAlphaContainers; *guid != NULL; ++guid) {
      if (IsEqualGUID(MAKE_REFGUID(src_container_format),
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -26,7 +26,9 @@ readonly OLDPATH=${PATH}

 # Add iPhoneOS-V6 to the list of platforms below if you need armv6 support.
 # Note that iPhoneOS-V6 support is not available with the iOS6 SDK.
-readonly PLATFORMS="iPhoneSimulator iPhoneOS-V7 iPhoneOS-V7s iPhoneOS-V7-arm64"
+PLATFORMS="iPhoneSimulator iPhoneSimulator64"
+PLATFORMS+=" iPhoneOS-V7 iPhoneOS-V7s iPhoneOS-V7-arm64"
+readonly PLATFORMS
 readonly SRCDIR=$(dirname $0)
 readonly TOPDIR=$(pwd)
 readonly BUILDDIR="${TOPDIR}/iosbuild"
@ -39,6 +41,8 @@ LIBLIST=''
 if [[ -z "${SDK}" ]]; then
  echo "iOS SDK not available"
  exit 1
+elif [[ ${SDK%%.*} -gt 8 ]]; then
+  EXTRA_CFLAGS="-fembed-bitcode"
 elif [[ ${SDK} < 6.0 ]]; then
  echo "You need iOS SDK version 6.0 or above"
  exit 1
@ -78,6 +82,9 @@ for PLATFORM in ${PLATFORMS}; do
  elif [[ "${PLATFORM}" == "iPhoneOS-V6" ]]; then
    PLATFORM="iPhoneOS"
    ARCH="armv6"
+  elif [[ "${PLATFORM}" == "iPhoneSimulator64" ]]; then
+    PLATFORM="iPhoneSimulator"
+    ARCH="x86_64"
  else
    ARCH="i386"
  fi
@ -89,7 +96,7 @@ for PLATFORM in ${PLATFORMS}; do
  SDKROOT="${PLATFORMSROOT}/"
  SDKROOT+="${PLATFORM}.platform/Developer/SDKs/${PLATFORM}${SDK}.sdk/"
  CFLAGS="-arch ${ARCH2:-${ARCH}} -pipe -isysroot ${SDKROOT} -O3 -DNDEBUG"
-  CFLAGS+=" -miphoneos-version-min=6.0"
+  CFLAGS+=" -miphoneos-version-min=6.0 ${EXTRA_CFLAGS}"

  set -x
  export PATH="${DEVROOT}/usr/bin:${OLDPATH}"
--- a/makefile.unix
+++ b/makefile.unix
@ -67,6 +67,8 @@ EXTRA_FLAGS += -Wmissing-prototypes
 EXTRA_FLAGS += -Wmissing-declarations
 EXTRA_FLAGS += -Wdeclaration-after-statement
 EXTRA_FLAGS += -Wshadow
+EXTRA_FLAGS += -Wformat-security -Wformat-nonliteral
+
 # EXTRA_FLAGS += -Wvla

 # AVX2-specific flags:
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "Oct 13, 2014"
+.TH CWEBP 1 "October 19, 2015"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@ -259,7 +259,7 @@ Only print brief information (output file size and PSNR) for testing purpose.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "July 22, 2014"
+.TH DWEBP 1 "October 19, 2015"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@ -106,7 +106,7 @@ Disable all assembly optimizations.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "March 7, 2014"
+.TH GIF2WEBP 1 "October 19, 2015"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@ -111,7 +111,7 @@ Do not print anything.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
--- a/man/vwebp.1
+++ b/man/vwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH VWEBP 1 "July 23, 2014"
+.TH VWEBP 1 "October 19, 2015"
 .SH NAME
 vwebp \- decompress a WebP file and display it in a window
 .SH SYNOPSIS
@ -65,7 +65,7 @@ Quit.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPMUX 1 "August 28, 2014"
+.TH WEBPMUX 1 "October 19, 2015"
 .SH NAME
 webpmux \- create animated WebP files from non\-animated WebP images, extract
 frames from animated WebP images, and manage XMP/EXIF metadata and ICC profile.
@ -129,7 +129,7 @@ The nature of EXIF, XMP and ICC data is not checked and is assumed to be valid.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting\-patches/
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -35,7 +35,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 5:2:0
+libwebp_la_LDFLAGS = -no-undefined -version-info 5:4:0
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

@ -47,7 +47,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la

-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:2:0
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:4:0
  pkgconfig_DATA += libwebpdecoder.pc
 endif

--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@ -33,6 +33,11 @@ static int IsValidColorspace(int webp_csp_mode) {
  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
 }

+// Smallest valid plane size: (HEIGHT - 1) full strides plus one unpadded row,
+// since the very last (or first, if flipped) row doesn't require padding.
+#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE)       \
+    ((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
+
 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  int ok = 1;
  const WEBP_CSP_MODE mode = buffer->colorspace;
@ -42,20 +47,22 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
    ok = 0;
  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int uv_width  = (width  + 1) / 2;
+    const int uv_height = (height + 1) / 2;
    const int y_stride = abs(buf->y_stride);
    const int u_stride = abs(buf->u_stride);
    const int v_stride = abs(buf->v_stride);
    const int a_stride = abs(buf->a_stride);
-    const uint64_t y_size = (uint64_t)y_stride * height;
-    const uint64_t u_size = (uint64_t)u_stride * ((height + 1) / 2);
-    const uint64_t v_size = (uint64_t)v_stride * ((height + 1) / 2);
-    const uint64_t a_size = (uint64_t)a_stride * height;
+    const uint64_t y_size = MIN_BUFFER_SIZE(width, height, y_stride);
+    const uint64_t u_size = MIN_BUFFER_SIZE(uv_width, uv_height, u_stride);
+    const uint64_t v_size = MIN_BUFFER_SIZE(uv_width, uv_height, v_stride);
+    const uint64_t a_size = MIN_BUFFER_SIZE(width, height, a_stride);
    ok &= (y_size <= buf->y_size);
    ok &= (u_size <= buf->u_size);
    ok &= (v_size <= buf->v_size);
    ok &= (y_stride >= width);
-    ok &= (u_stride >= (width + 1) / 2);
-    ok &= (v_stride >= (width + 1) / 2);
+    ok &= (u_stride >= uv_width);
+    ok &= (v_stride >= uv_width);
    ok &= (buf->y != NULL);
    ok &= (buf->u != NULL);
    ok &= (buf->v != NULL);
@ -67,13 +74,14 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  } else {    // RGB checks
    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
    const int stride = abs(buf->stride);
-    const uint64_t size = (uint64_t)stride * height;
+    // The last row must hold 'width' pixels, i.e. width * bpp bytes: 'stride'
+    // and 'buf->size' are in bytes, so the row width must be in bytes too.
+    const uint64_t size =
+        MIN_BUFFER_SIZE(width * kModeBpp[mode], height, stride);
    ok &= (size <= buf->size);
    ok &= (stride >= width * kModeBpp[mode]);
    ok &= (buf->rgba != NULL);
  }
  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
+#undef MIN_BUFFER_SIZE

 static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
  const int w = buffer->width;
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@ -357,30 +357,33 @@ static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
 }

 // Partition #0
-static int CopyParts0Data(WebPIDecoder* const idec) {
+static VP8StatusCode CopyParts0Data(WebPIDecoder* const idec) {
  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  VP8BitReader* const br = &dec->br_;
-  const size_t psize = br->buf_end_ - br->buf_;
+  const size_t part_size = br->buf_end_ - br->buf_;
  MemBuffer* const mem = &idec->mem_;
  assert(!idec->is_lossless_);
  assert(mem->part0_buf_ == NULL);
-  assert(psize > 0);
-  assert(psize <= mem->part0_size_);  // Format limit: no need for runtime check
+  // the following is a format limitation, no need for runtime check:
+  assert(part_size <= mem->part0_size_);
+  if (part_size == 0) {   // can't have zero-size partition #0
+    return VP8_STATUS_BITSTREAM_ERROR;
+  }
  if (mem->mode_ == MEM_MODE_APPEND) {
    // We copy and grab ownership of the partition #0 data.
-    uint8_t* const part0_buf = (uint8_t*)WebPSafeMalloc(1ULL, psize);
+    uint8_t* const part0_buf = (uint8_t*)WebPSafeMalloc(1ULL, part_size);
    if (part0_buf == NULL) {
-      return 0;
+      return VP8_STATUS_OUT_OF_MEMORY;
    }
-    memcpy(part0_buf, br->buf_, psize);
+    memcpy(part0_buf, br->buf_, part_size);
    mem->part0_buf_ = part0_buf;
    br->buf_ = part0_buf;
-    br->buf_end_ = part0_buf + psize;
+    br->buf_end_ = part0_buf + part_size;
  } else {
    // Else: just keep pointers to the partition #0's data in dec_->br_.
  }
-  mem->start_ += psize;
-  return 1;
+  mem->start_ += part_size;
+  return VP8_STATUS_OK;
 }

 static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
@ -414,8 +417,10 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
  dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
                                       io->width, io->height);
  VP8InitDithering(params->options, dec);
-  if (!CopyParts0Data(idec)) {
-    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
+
+  dec->status_ = CopyParts0Data(idec);
+  if (dec->status_ != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
  }

  // Finish setting up the decoding parameters. Will call io->setup().
--- a/src/dec/io.c
+++ b/src/dec/io.c
@ -322,37 +322,31 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
  const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
  size_t tmp_size;
-  int32_t* work;
+  rescaler_t* work;

  tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
  if (has_alpha) {
    tmp_size += work_size * sizeof(*work);
  }
-  p->memory = WebPSafeCalloc(1ULL, tmp_size);
+  p->memory = WebPSafeMalloc(1ULL, tmp_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                   buf->y, out_width, out_height, buf->y_stride, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                   work);
  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                   work + work_size);
  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                   work + work_size + uv_work_size);
  p->emit = EmitRescaledYUV;

  if (has_alpha) {
    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                     buf->a, out_width, out_height, buf->a_stride, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                     work + work_size + 2 * uv_work_size);
    p->emit_alpha = EmitRescaledAlphaYUV;
    WebPInitAlphaProcessing();
@ -375,9 +369,9 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
         WebPRescalerHasPendingOutput(&p->scaler_u)) {
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
-    WebPRescalerExportRow(&p->scaler_y, 0);
-    WebPRescalerExportRow(&p->scaler_u, 0);
-    WebPRescalerExportRow(&p->scaler_v, 0);
+    WebPRescalerExportRow(&p->scaler_y);
+    WebPRescalerExportRow(&p->scaler_u);
+    WebPRescalerExportRow(&p->scaler_v);
    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
            dst, p->scaler_y.dst_width);
    dst += buf->stride;
@ -425,7 +419,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) {
  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
    int i;
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
+    WebPRescalerExportRow(&p->scaler_a);
    for (i = 0; i < width; ++i) {
      const uint32_t alpha_value = p->scaler_a.dst[i];
      dst[4 * i] = alpha_value;
@ -458,7 +452,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
    int i;
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
+    WebPRescalerExportRow(&p->scaler_a);
    for (i = 0; i < width; ++i) {
      // Fill in the alpha value (converted to 4 bits).
      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
@ -495,7 +489,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const int uv_in_width  = (io->mb_w + 1) >> 1;
  const int uv_in_height = (io->mb_h + 1) >> 1;
  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
-  int32_t* work;  // rescalers work area
+  rescaler_t* work;  // rescalers work area
  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
  size_t tmp_size1, tmp_size2, total_size;

@ -506,30 +500,26 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
    tmp_size2 += out_width;
  }
  total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
-  p->memory = WebPSafeCalloc(1ULL, total_size);
+  p->memory = WebPSafeMalloc(1ULL, total_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
  tmp = (uint8_t*)(work + tmp_size1);
  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                   tmp + 0 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                   work + 0 * work_size);
  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                   tmp + 1 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 1 * work_size);
  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                   tmp + 2 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 2 * work_size);
  p->emit = EmitRescaledRGB;

  if (has_alpha) {
    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                     tmp + 3 * out_width, out_width, out_height, 0, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                     work + 3 * work_size);
    p->emit_alpha = EmitRescaledAlphaRGB;
    if (p->output->colorspace == MODE_RGBA_4444 ||
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -562,6 +562,7 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
    }
    block->non_zero_y_ = 0;
    block->non_zero_uv_ = 0;
+    block->dither_ = 0;
  }

  if (dec->filter_type_ > 0) {  // store filter info
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -31,7 +31,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 0
 #define DEC_MIN_VERSION 4
-#define DEC_REV_VERSION 2
+#define DEC_REV_VERSION 4

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
@ -390,13 +390,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
  const int in_height = io->mb_h;
  const int out_height = io->scaled_height;
  const uint64_t work_size = 2 * num_channels * (uint64_t)out_width;
-  int32_t* work;        // Rescaler work area.
-  const uint64_t scaled_data_size = num_channels * (uint64_t)out_width;
+  rescaler_t* work;        // Rescaler work area.
+  const uint64_t scaled_data_size = (uint64_t)out_width;
  uint32_t* scaled_data;  // Temporary storage for scaled BGRA data.
  const uint64_t memory_size = sizeof(*dec->rescaler) +
                               work_size * sizeof(*work) +
                               scaled_data_size * sizeof(*scaled_data);
-  uint8_t* memory = (uint8_t*)WebPSafeCalloc(memory_size, sizeof(*memory));
+  uint8_t* memory = (uint8_t*)WebPSafeMalloc(memory_size, sizeof(*memory));
  if (memory == NULL) {
    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
    return 0;
@ -406,13 +406,12 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {

  dec->rescaler = (WebPRescaler*)memory;
  memory += sizeof(*dec->rescaler);
-  work = (int32_t*)memory;
+  work = (rescaler_t*)memory;
  memory += work_size * sizeof(*work);
  scaled_data = (uint32_t*)memory;

  WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
-                   out_width, out_height, 0, num_channels,
-                   in_width, out_width, in_height, out_height, work);
+                   out_width, out_height, 0, num_channels, work);
  return 1;
 }

@ -427,7 +426,7 @@ static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
  int num_lines_out = 0;
  while (WebPRescalerHasPendingOutput(rescaler)) {
    uint8_t* const dst = rgba + num_lines_out * rgba_stride;
-    WebPRescalerExportRow(rescaler, 0);
+    WebPRescalerExportRow(rescaler);
    WebPMultARGBRow(src, dst_width, 1);
    VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
    ++num_lines_out;
@ -545,7 +544,7 @@ static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
  const int dst_width = rescaler->dst_width;
  int num_lines_out = 0;
  while (WebPRescalerHasPendingOutput(rescaler)) {
-    WebPRescalerExportRow(rescaler, 0);
+    WebPRescalerExportRow(rescaler);
    WebPMultARGBRow(src, dst_width, 1);
    ConvertToYUVA(src, dst_width, y_pos, dec->output_);
    ++y_pos;
@ -900,7 +899,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
          process_func(dec, row);
        }
      }
-      if (src < src_last) {
+      if (src < src_end) {
        if (col & mask) htree_group = GetHtreeGroupForPos(hdr, col, row);
        if (color_cache != NULL) {
          while (last_cached < src) {
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@ -311,7 +311,12 @@ int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);

 extern void WebPInitAlphaProcessingSSE2(void);

+static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
+    (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
+
 void WebPInitAlphaProcessing(void) {
+  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  WebPMultARGBRow = MultARGBRow;
  WebPMultRow = MultRow;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
@ -326,4 +331,5 @@ void WebPInitAlphaProcessing(void) {
    }
 #endif
  }
+  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -29,16 +29,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
    "cpuid\n"
    "xchg %%edi, %%ebx\n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
+    : "a"(info_type), "c"(0));
 }
 #elif defined(__i386__) || defined(__x86_64__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "cpuid\n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
+    : "a"(info_type), "c"(0));
 }
-#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
+#include <intrin.h>
 #define GetCPUInfo(info, type) __cpuidex(info, type, 0)  // set ecx=0
 #elif defined(WEBP_MSC_SSE2)
 #define GetCPUInfo __cpuid
@ -55,7 +57,9 @@ static WEBP_INLINE uint64_t xgetbv(void) {
    : "=a"(eax), "=d"(edx) : "c" (ecx));
  return ((uint64_t)edx << 32) | eax;
 }
-#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
 #define xgetbv() _xgetbv(0)
 #elif defined(_MSC_VER) && defined(_M_IX86)
 static WEBP_INLINE uint64_t xgetbv(void) {
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -688,7 +688,12 @@ extern void VP8DspInitSSE2(void);
 extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);

+static volatile VP8CPUInfo dec_last_cpuinfo_used =
+    (VP8CPUInfo)&dec_last_cpuinfo_used;
+
 void VP8DspInit(void) {
+  if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  VP8InitClipTables();

  VP8TransformWHT = TransformWHT;
@ -727,5 +732,5 @@ void VP8DspInit(void) {
    }
 #endif
  }
+  dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
-
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@ -24,24 +24,24 @@

 // Load/Store vertical edge
 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
-  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
+  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

 #define STORE8x2(c1, c2, p, stride)                                            \
-  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
+  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

 #if !defined(WORK_AROUND_GCC)

--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -36,14 +36,9 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif

-#ifdef __clang__
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif  // __clang__
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif

 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
@ -66,11 +61,15 @@ extern "C" {
 #define WEBP_ANDROID_NEON  // Android targets that might support NEON
 #endif

-#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || defined(__aarch64__)
+// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
+// inline assembly would need to be modified for use with Native Client.
+#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
+     defined(__aarch64__)) && !defined(__native_client__)
 #define WEBP_USE_NEON
 #endif

-#if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6)
+#if defined(__mips__) && !defined(__mips64) && \
+    defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
 #define WEBP_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WEBP_USE_MIPS32_R2
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -692,7 +692,12 @@ extern void VP8EncDspInitAVX2(void);
 extern void VP8EncDspInitNEON(void);
 extern void VP8EncDspInitMIPS32(void);

+static volatile VP8CPUInfo enc_last_cpuinfo_used =
+    (VP8CPUInfo)&enc_last_cpuinfo_used;
+
 void VP8EncDspInit(void) {
+  if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  VP8DspInit();  // common inverse transforms
  InitTables();

@ -737,5 +742,6 @@ void VP8EncDspInit(void) {
    }
 #endif
  }
+  enc_last_cpuinfo_used = VP8GetCPUInfo;
 }

--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@ -34,26 +34,26 @@ static const int kC2 = 35468;
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 // TEMP4..TEMP5 - temporary registers
 #define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
-  "lh      %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lh      %[temp18],      "#B"(%[temp20])                 \n\t"            \
-  "lh      %[temp17],      "#C"(%[temp20])                 \n\t"            \
-  "lh      %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP4"],    %[temp16],      %[temp18]       \n\t"            \
-  "subu    %[temp16],      %[temp16],      %[temp18]       \n\t"            \
-  "mul     %["#TEMP0"],    %[temp17],      %[kC2]          \n\t"            \
-  "mul     %[temp18],      %[temp19],      %[kC1]          \n\t"            \
-  "mul     %[temp17],      %[temp17],      %[kC1]          \n\t"            \
-  "mul     %[temp19],      %[temp19],      %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\n"            \
-  "sra     %[temp18],      %[temp18],      16              \n\n"            \
-  "sra     %[temp17],      %[temp17],      16              \n\n"            \
-  "sra     %[temp19],      %[temp19],      16              \n\n"            \
-  "subu    %["#TEMP2"],    %["#TEMP0"],    %[temp18]       \n\t"            \
-  "addu    %["#TEMP3"],    %[temp17],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP0"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"            \
-  "addu    %["#TEMP1"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP2"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP3"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"
+  "lh      %[temp16],      " #A "(%[temp20])                 \n\t"          \
+  "lh      %[temp18],      " #B "(%[temp20])                 \n\t"          \
+  "lh      %[temp17],      " #C "(%[temp20])                 \n\t"          \
+  "lh      %[temp19],      " #D "(%[temp20])                 \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp16],      %[temp18]       \n\t"          \
+  "subu    %[temp16],      %[temp16],      %[temp18]         \n\t"          \
+  "mul     %[" #TEMP0 "],    %[temp17],      %[kC2]          \n\t"          \
+  "mul     %[temp18],      %[temp19],      %[kC1]            \n\t"          \
+  "mul     %[temp17],      %[temp17],      %[kC1]            \n\t"          \
+  "mul     %[temp19],      %[temp19],      %[kC2]            \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\n"          \
+  "sra     %[temp18],      %[temp18],      16                \n\n"          \
+  "sra     %[temp17],      %[temp17],      16                \n\n"          \
+  "sra     %[temp19],      %[temp19],      16                \n\n"          \
+  "subu    %[" #TEMP2 "],    %[" #TEMP0 "],    %[temp18]     \n\t"          \
+  "addu    %[" #TEMP3 "],    %[temp17],      %[temp19]       \n\t"          \
+  "addu    %[" #TEMP0 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"          \
+  "addu    %[" #TEMP1 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP2 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP3 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"

 // macro for one horizontal pass in ITransformOne
 // MUL and STORE macros inlined
@ -61,59 +61,59 @@ static const int kC2 = 35468;
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to load from ref and store to dst buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)            \
-  "addiu   %["#TEMP0"],    %["#TEMP0"],    4               \n\t"            \
-  "addu    %[temp16],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "subu    %[temp17],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "mul     %["#TEMP0"],    %["#TEMP4"],    %[kC2]          \n\t"            \
-  "mul     %["#TEMP8"],    %["#TEMP12"],   %[kC1]          \n\t"            \
-  "mul     %["#TEMP4"],    %["#TEMP4"],    %[kC1]          \n\t"            \
-  "mul     %["#TEMP12"],   %["#TEMP12"],   %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    16              \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    16              \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   16              \n\t"            \
-  "subu    %[temp18],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "addu    %[temp19],      %["#TEMP4"],    %["#TEMP12"]    \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP8"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP12"],   %[temp16],      %[temp19]       \n\t"            \
-  "lw      %[temp20],      0(%[args])                      \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    3               \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    3               \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    3               \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   3               \n\t"            \
-  "lbu     %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lbu     %[temp17],      "#B"(%[temp20])                 \n\t"            \
-  "lbu     %[temp18],      "#C"(%[temp20])                 \n\t"            \
-  "lbu     %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %["#TEMP0"]     \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %["#TEMP4"]     \n\t"            \
-  "addu    %["#TEMP8"],    %[temp18],      %["#TEMP8"]     \n\t"            \
-  "addu    %["#TEMP12"],   %[temp19],      %["#TEMP12"]    \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    $zero           \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    $zero           \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    $zero           \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   $zero           \n\t"            \
-  "movn    %["#TEMP0"],    $zero,          %[temp16]       \n\t"            \
-  "movn    %["#TEMP4"],    $zero,          %[temp17]       \n\t"            \
-  "movn    %["#TEMP8"],    $zero,          %[temp18]       \n\t"            \
-  "movn    %["#TEMP12"],   $zero,          %[temp19]       \n\t"            \
-  "addiu   %[temp20],      $zero,          255             \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    %[temp20]       \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    %[temp20]       \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    %[temp20]       \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   %[temp20]       \n\t"            \
-  "movz    %["#TEMP0"],    %[temp20],      %[temp16]       \n\t"            \
-  "movz    %["#TEMP4"],    %[temp20],      %[temp17]       \n\t"            \
-  "lw      %[temp16],      8(%[args])                      \n\t"            \
-  "movz    %["#TEMP8"],    %[temp20],      %[temp18]       \n\t"            \
-  "movz    %["#TEMP12"],   %[temp20],      %[temp19]       \n\t"            \
-  "sb      %["#TEMP0"],    "#A"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP4"],    "#B"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP8"],    "#C"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP12"],   "#D"(%[temp16])                 \n\t"
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)              \
+  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4             \n\t"            \
+  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
+  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
+  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]        \n\t"            \
+  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]        \n\t"            \
+  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]        \n\t"            \
+  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]        \n\t"            \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\t"            \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16            \n\t"            \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16            \n\t"            \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16            \n\t"            \
+  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
+  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]  \n\t"            \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[temp19]       \n\t"            \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[temp18]       \n\t"            \
+  "subu    %[" #TEMP8 "],    %[temp17],      %[temp18]       \n\t"            \
+  "subu    %[" #TEMP12 "],   %[temp16],      %[temp19]       \n\t"            \
+  "lw      %[temp20],      0(%[args])                        \n\t"            \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    3             \n\t"            \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    3             \n\t"            \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    3             \n\t"            \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   3             \n\t"            \
+  "lbu     %[temp16],      " #A "(%[temp20])                 \n\t"            \
+  "lbu     %[temp17],      " #B "(%[temp20])                 \n\t"            \
+  "lbu     %[temp18],      " #C "(%[temp20])                 \n\t"            \
+  "lbu     %[temp19],      " #D "(%[temp20])                 \n\t"            \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[" #TEMP0 "]   \n\t"            \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[" #TEMP4 "]   \n\t"            \
+  "addu    %[" #TEMP8 "],    %[temp18],      %[" #TEMP8 "]   \n\t"            \
+  "addu    %[" #TEMP12 "],   %[temp19],      %[" #TEMP12 "]  \n\t"            \
+  "slt     %[temp16],      %[" #TEMP0 "],    $zero           \n\t"            \
+  "slt     %[temp17],      %[" #TEMP4 "],    $zero           \n\t"            \
+  "slt     %[temp18],      %[" #TEMP8 "],    $zero           \n\t"            \
+  "slt     %[temp19],      %[" #TEMP12 "],   $zero           \n\t"            \
+  "movn    %[" #TEMP0 "],    $zero,          %[temp16]       \n\t"            \
+  "movn    %[" #TEMP4 "],    $zero,          %[temp17]       \n\t"            \
+  "movn    %[" #TEMP8 "],    $zero,          %[temp18]       \n\t"            \
+  "movn    %[" #TEMP12 "],   $zero,          %[temp19]       \n\t"            \
+  "addiu   %[temp20],      $zero,          255               \n\t"            \
+  "slt     %[temp16],      %[" #TEMP0 "],    %[temp20]       \n\t"            \
+  "slt     %[temp17],      %[" #TEMP4 "],    %[temp20]       \n\t"            \
+  "slt     %[temp18],      %[" #TEMP8 "],    %[temp20]       \n\t"            \
+  "slt     %[temp19],      %[" #TEMP12 "],   %[temp20]       \n\t"            \
+  "movz    %[" #TEMP0 "],    %[temp20],      %[temp16]       \n\t"            \
+  "movz    %[" #TEMP4 "],    %[temp20],      %[temp17]       \n\t"            \
+  "lw      %[temp16],      8(%[args])                        \n\t"            \
+  "movz    %[" #TEMP8 "],    %[temp20],      %[temp18]       \n\t"            \
+  "movz    %[" #TEMP12 "],   %[temp20],      %[temp19]       \n\t"            \
+  "sb      %[" #TEMP0 "],    " #A "(%[temp16])               \n\t"            \
+  "sb      %[" #TEMP4 "],    " #B "(%[temp16])               \n\t"            \
+  "sb      %[" #TEMP8 "],    " #C "(%[temp16])               \n\t"            \
+  "sb      %[" #TEMP12 "],   " #D "(%[temp16])               \n\t"

 // Does one or two inverse transforms.
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
@ -164,9 +164,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
 // K - offset in bytes (kZigzag[n] * 4)
 // N - offset in bytes (n * 2)
 #define QUANTIZE_ONE(J, K, N)                                               \
-  "lh           %[temp0],       "#J"(%[ppin])                       \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppsharpen])                  \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppzthresh])                  \n\t"   \
+  "lh           %[temp0],       " #J "(%[ppin])                     \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppsharpen])                \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppzthresh])                \n\t"   \
  "sra          %[sign],        %[temp0],           15              \n\t"   \
  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
@ -175,9 +175,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
  "addiu        %[temp5],       $zero,              0               \n\t"   \
  "addiu        %[level],       $zero,              0               \n\t"   \
  "beqz         %[temp4],       2f                                  \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppiq])                       \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppbias])                     \n\t"   \
-  "lhu          %[temp3],       "#J"(%[ppq])                        \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppiq])                     \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppbias])                   \n\t"   \
+  "lhu          %[temp3],       " #J "(%[ppq])                      \n\t"   \
  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
  "sra          %[level],       %[level],           17              \n\t"   \
@ -187,8 +187,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
  "subu         %[level],       %[level],           %[sign]         \n\t"   \
  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
 "2:                                                                 \n\t"   \
-  "sh           %[temp5],       "#J"(%[ppin])                       \n\t"   \
-  "sh           %[level],       "#N"(%[pout])                       \n\t"
+  "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
+  "sh           %[level],       " #N "(%[pout])                     \n\t"

 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
@ -249,14 +249,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
 // E..H - offsets in bytes to store first results to tmp buffer
 // E1..H1 - offsets in bytes to store second results to tmp buffer
 #define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1)   \
-  "lbu    %[temp0],  "#A"(%[a])              \n\t"                \
-  "lbu    %[temp1],  "#B"(%[a])              \n\t"                \
-  "lbu    %[temp2],  "#C"(%[a])              \n\t"                \
-  "lbu    %[temp3],  "#D"(%[a])              \n\t"                \
-  "lbu    %[temp4],  "#A"(%[b])              \n\t"                \
-  "lbu    %[temp5],  "#B"(%[b])              \n\t"                \
-  "lbu    %[temp6],  "#C"(%[b])              \n\t"                \
-  "lbu    %[temp7],  "#D"(%[b])              \n\t"                \
+  "lbu    %[temp0],  " #A "(%[a])            \n\t"                \
+  "lbu    %[temp1],  " #B "(%[a])            \n\t"                \
+  "lbu    %[temp2],  " #C "(%[a])            \n\t"                \
+  "lbu    %[temp3],  " #D "(%[a])            \n\t"                \
+  "lbu    %[temp4],  " #A "(%[b])            \n\t"                \
+  "lbu    %[temp5],  " #B "(%[b])            \n\t"                \
+  "lbu    %[temp6],  " #C "(%[b])            \n\t"                \
+  "lbu    %[temp7],  " #D "(%[b])            \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
  "addu   %[temp2],  %[temp1],    %[temp3]   \n\t"                \
@ -273,14 +273,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  "subu   %[temp3],  %[temp3],    %[temp6]   \n\t"                \
  "addu   %[temp6],  %[temp4],    %[temp5]   \n\t"                \
  "subu   %[temp4],  %[temp4],    %[temp5]   \n\t"                \
-  "sw     %[temp7],  "#E"(%[tmp])            \n\t"                \
-  "sw     %[temp2],  "#H"(%[tmp])            \n\t"                \
-  "sw     %[temp8],  "#F"(%[tmp])            \n\t"                \
-  "sw     %[temp0],  "#G"(%[tmp])            \n\t"                \
-  "sw     %[temp1],  "#E1"(%[tmp])           \n\t"                \
-  "sw     %[temp3],  "#H1"(%[tmp])           \n\t"                \
-  "sw     %[temp6],  "#F1"(%[tmp])           \n\t"                \
-  "sw     %[temp4],  "#G1"(%[tmp])           \n\t"
+  "sw     %[temp7],  " #E "(%[tmp])          \n\t"                \
+  "sw     %[temp2],  " #H "(%[tmp])          \n\t"                \
+  "sw     %[temp8],  " #F "(%[tmp])          \n\t"                \
+  "sw     %[temp0],  " #G "(%[tmp])          \n\t"                \
+  "sw     %[temp1],  " #E1 "(%[tmp])         \n\t"                \
+  "sw     %[temp3],  " #H1 "(%[tmp])         \n\t"                \
+  "sw     %[temp6],  " #F1 "(%[tmp])         \n\t"                \
+  "sw     %[temp4],  " #G1 "(%[tmp])         \n\t"

 // macro for one vertical pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
@ -295,10 +295,10 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
 // A1..D1 - offsets in bytes to load second results from tmp buffer
 // E..H - offsets in bytes to load from w buffer
 #define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
-  "lw     %[temp0],  "#A1"(%[tmp])           \n\t"                \
-  "lw     %[temp1],  "#C1"(%[tmp])           \n\t"                \
-  "lw     %[temp2],  "#B1"(%[tmp])           \n\t"                \
-  "lw     %[temp3],  "#D1"(%[tmp])           \n\t"                \
+  "lw     %[temp0],  " #A1 "(%[tmp])         \n\t"                \
+  "lw     %[temp1],  " #C1 "(%[tmp])         \n\t"                \
+  "lw     %[temp2],  " #B1 "(%[tmp])         \n\t"                \
+  "lw     %[temp3],  " #D1 "(%[tmp])         \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@ -319,18 +319,18 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
-  "lhu    %[temp4],  "#E"(%[w])              \n\t"                \
-  "lhu    %[temp5],  "#F"(%[w])              \n\t"                \
-  "lhu    %[temp6],  "#G"(%[w])              \n\t"                \
-  "lhu    %[temp7],  "#H"(%[w])              \n\t"                \
+  "lhu    %[temp4],  " #E "(%[w])            \n\t"                \
+  "lhu    %[temp5],  " #F "(%[w])            \n\t"                \
+  "lhu    %[temp6],  " #G "(%[w])            \n\t"                \
+  "lhu    %[temp7],  " #H "(%[w])            \n\t"                \
  "madd   %[temp4],  %[temp3]                \n\t"                \
  "madd   %[temp5],  %[temp1]                \n\t"                \
  "madd   %[temp6],  %[temp0]                \n\t"                \
  "madd   %[temp7],  %[temp8]                \n\t"                \
-  "lw     %[temp0],  "#A"(%[tmp])            \n\t"                \
-  "lw     %[temp1],  "#C"(%[tmp])            \n\t"                \
-  "lw     %[temp2],  "#B"(%[tmp])            \n\t"                \
-  "lw     %[temp3],  "#D"(%[tmp])            \n\t"                \
+  "lw     %[temp0],  " #A "(%[tmp])          \n\t"                \
+  "lw     %[temp1],  " #C "(%[tmp])          \n\t"                \
+  "lw     %[temp2],  " #B "(%[tmp])          \n\t"                \
+  "lw     %[temp3],  " #D "(%[tmp])          \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@ -407,71 +407,71 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to load from src and ref buffers
 // TEMP0..TEMP3 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
-  "lw     %["#TEMP1"],  0(%[args])                     \n\t"    \
-  "lw     %["#TEMP2"],  4(%[args])                     \n\t"    \
-  "lbu    %[temp16],    "#A"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#A"(%["#TEMP2"])              \n\t"    \
-  "lbu    %[temp18],    "#B"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#B"(%["#TEMP2"])              \n\t"    \
-  "subu   %[temp20],    %[temp16],    %[temp17]        \n\t"    \
-  "lbu    %[temp16],    "#C"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#C"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP0"],  %[temp18],    %[temp19]        \n\t"    \
-  "lbu    %[temp18],    "#D"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#D"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP1"],  %[temp16],    %[temp17]        \n\t"    \
-  "subu   %["#TEMP2"],  %[temp18],    %[temp19]        \n\t"    \
-  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "mul    %[temp16],    %["#TEMP2"],  %[c5352]         \n\t"    \
-  "mul    %[temp17],    %["#TEMP2"],  %[c2217]         \n\t"    \
-  "mul    %[temp18],    %["#TEMP0"],  %[c5352]         \n\t"    \
-  "mul    %[temp19],    %["#TEMP0"],  %[c2217]         \n\t"    \
-  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]        \n\t"    \
-  "subu   %[temp20],    %["#TEMP3"],  %[temp20]        \n\t"    \
-  "sll    %["#TEMP0"],  %["#TEMP1"],  3                \n\t"    \
-  "sll    %["#TEMP2"],  %[temp20],    3                \n\t"    \
-  "addiu  %[temp16],    %[temp16],    1812             \n\t"    \
-  "addiu  %[temp17],    %[temp17],    937              \n\t"    \
-  "addu   %[temp16],    %[temp16],    %[temp19]        \n\t"    \
-  "subu   %[temp17],    %[temp17],    %[temp18]        \n\t"    \
-  "sra    %["#TEMP1"],  %[temp16],    9                \n\t"    \
-  "sra    %["#TEMP3"],  %[temp17],    9                \n\t"
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3)   \
+  "lw     %[" #TEMP1 "],  0(%[args])                     \n\t"    \
+  "lw     %[" #TEMP2 "],  4(%[args])                     \n\t"    \
+  "lbu    %[temp16],    " #A "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp17],    " #A "(%[" #TEMP2 "])            \n\t"    \
+  "lbu    %[temp18],    " #B "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp19],    " #B "(%[" #TEMP2 "])            \n\t"    \
+  "subu   %[temp20],    %[temp16],    %[temp17]          \n\t"    \
+  "lbu    %[temp16],    " #C "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp17],    " #C "(%[" #TEMP2 "])            \n\t"    \
+  "subu   %[" #TEMP0 "],  %[temp18],    %[temp19]        \n\t"    \
+  "lbu    %[temp18],    " #D "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp19],    " #D "(%[" #TEMP2 "])            \n\t"    \
+  "subu   %[" #TEMP1 "],  %[temp16],    %[temp17]        \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp18],    %[temp19]        \n\t"    \
+  "addu   %[" #TEMP3 "],  %[temp20],    %[" #TEMP2 "]    \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp20],    %[" #TEMP2 "]    \n\t"    \
+  "addu   %[temp20],    %[" #TEMP0 "],  %[" #TEMP1 "]    \n\t"    \
+  "subu   %[" #TEMP0 "],  %[" #TEMP0 "],  %[" #TEMP1 "]  \n\t"    \
+  "mul    %[temp16],    %[" #TEMP2 "],  %[c5352]         \n\t"    \
+  "mul    %[temp17],    %[" #TEMP2 "],  %[c2217]         \n\t"    \
+  "mul    %[temp18],    %[" #TEMP0 "],  %[c5352]         \n\t"    \
+  "mul    %[temp19],    %[" #TEMP0 "],  %[c2217]         \n\t"    \
+  "addu   %[" #TEMP1 "],  %[" #TEMP3 "],  %[temp20]      \n\t"    \
+  "subu   %[temp20],    %[" #TEMP3 "],  %[temp20]        \n\t"    \
+  "sll    %[" #TEMP0 "],  %[" #TEMP1 "],  3              \n\t"    \
+  "sll    %[" #TEMP2 "],  %[temp20],    3                \n\t"    \
+  "addiu  %[temp16],    %[temp16],    1812               \n\t"    \
+  "addiu  %[temp17],    %[temp17],    937                \n\t"    \
+  "addu   %[temp16],    %[temp16],    %[temp19]          \n\t"    \
+  "subu   %[temp17],    %[temp17],    %[temp18]          \n\t"    \
+  "sra    %[" #TEMP1 "],  %[temp16],    9                \n\t"    \
+  "sra    %[" #TEMP3 "],  %[temp17],    9                \n\t"

 // macro for one vertical pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to store to out buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)  \
-  "addu   %[temp16],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "subu   %[temp19],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "addu   %[temp17],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "subu   %[temp18],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "mul    %["#TEMP8"],  %[temp19],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP12"], %[temp18],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP4"],  %[temp19],    %[c5352]         \n\t"    \
-  "mul    %[temp18],    %[temp18],    %[c5352]         \n\t"    \
-  "addiu  %[temp16],    %[temp16],    7                \n\t"    \
-  "addu   %["#TEMP0"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP0"],  %["#TEMP0"],  4                \n\t"    \
-  "addu   %["#TEMP12"], %["#TEMP12"], %["#TEMP4"]      \n\t"    \
-  "subu   %["#TEMP4"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP4"],  %["#TEMP4"],  4                \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  30000            \n\t"    \
-  "addiu  %["#TEMP12"], %["#TEMP12"], 12000            \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  21000            \n\t"    \
-  "subu   %["#TEMP8"],  %["#TEMP8"],  %[temp18]        \n\t"    \
-  "sra    %["#TEMP12"], %["#TEMP12"], 16               \n\t"    \
-  "sra    %["#TEMP8"],  %["#TEMP8"],  16               \n\t"    \
-  "addiu  %[temp16],    %["#TEMP12"], 1                \n\t"    \
-  "movn   %["#TEMP12"], %[temp16],    %[temp19]        \n\t"    \
-  "sh     %["#TEMP0"],  "#A"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP4"],  "#C"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP8"],  "#D"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP12"], "#B"(%[temp20])                \n\t"
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)    \
+  "addu   %[temp16],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "subu   %[temp19],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "addu   %[temp17],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "subu   %[temp18],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "mul    %[" #TEMP8 "],  %[temp19],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP12 "], %[temp18],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP4 "],  %[temp19],    %[c5352]         \n\t"    \
+  "mul    %[temp18],    %[temp18],    %[c5352]           \n\t"    \
+  "addiu  %[temp16],    %[temp16],    7                  \n\t"    \
+  "addu   %[" #TEMP0 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP0 "],  %[" #TEMP0 "],  4              \n\t"    \
+  "addu   %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "]  \n\t"    \
+  "subu   %[" #TEMP4 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP4 "],  %[" #TEMP4 "],  4              \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  30000          \n\t"    \
+  "addiu  %[" #TEMP12 "], %[" #TEMP12 "], 12000          \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  21000          \n\t"    \
+  "subu   %[" #TEMP8 "],  %[" #TEMP8 "],  %[temp18]      \n\t"    \
+  "sra    %[" #TEMP12 "], %[" #TEMP12 "], 16             \n\t"    \
+  "sra    %[" #TEMP8 "],  %[" #TEMP8 "],  16             \n\t"    \
+  "addiu  %[temp16],    %[" #TEMP12 "], 1                \n\t"    \
+  "movn   %[" #TEMP12 "], %[temp16],    %[temp19]        \n\t"    \
+  "sh     %[" #TEMP0 "],  " #A "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP4 "],  " #C "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"

 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -622,14 +622,14 @@ int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
 }

 #define GET_SSE_INNER(A, B, C, D)                               \
-  "lbu     %[temp0],    "#A"(%[a])                   \n\t"      \
-  "lbu     %[temp1],    "#A"(%[b])                   \n\t"      \
-  "lbu     %[temp2],    "#B"(%[a])                   \n\t"      \
-  "lbu     %[temp3],    "#B"(%[b])                   \n\t"      \
-  "lbu     %[temp4],    "#C"(%[a])                   \n\t"      \
-  "lbu     %[temp5],    "#C"(%[b])                   \n\t"      \
-  "lbu     %[temp6],    "#D"(%[a])                   \n\t"      \
-  "lbu     %[temp7],    "#D"(%[b])                   \n\t"      \
+  "lbu     %[temp0],    " #A "(%[a])                 \n\t"      \
+  "lbu     %[temp1],    " #A "(%[b])                 \n\t"      \
+  "lbu     %[temp2],    " #B "(%[a])                 \n\t"      \
+  "lbu     %[temp3],    " #B "(%[b])                 \n\t"      \
+  "lbu     %[temp4],    " #C "(%[a])                 \n\t"      \
+  "lbu     %[temp5],    " #C "(%[b])                 \n\t"      \
+  "lbu     %[temp6],    " #D "(%[a])                 \n\t"      \
+  "lbu     %[temp7],    " #D "(%[b])                 \n\t"      \
  "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
  "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
  "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -1012,8 +1012,10 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  const int16x8_t out0 = Quantize(in, mtx, 0);
  const int16x8_t out1 = Quantize(in, mtx, 8);
  uint8x8x4_t shuffles;
-  // vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there.
-#if defined(__APPLE__) && defined(__aarch64__)
+  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+  // non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@ -1590,7 +1590,12 @@ extern void VP8LDspInitSSE2(void);
 extern void VP8LDspInitNEON(void);
 extern void VP8LDspInitMIPS32(void);

+static volatile VP8CPUInfo lossless_last_cpuinfo_used =
+    (VP8CPUInfo)&lossless_last_cpuinfo_used;
+
 void VP8LDspInit(void) {
+  if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));

  VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
@ -1634,6 +1639,7 @@ void VP8LDspInit(void) {
    }
 #endif
  }
+  lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }

 //------------------------------------------------------------------------------
--- a/src/dsp/lossless_mips32.c
+++ b/src/dsp/lossless_mips32.c
@ -285,28 +285,28 @@ static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
 // literal_ and successive histograms could be unaligned
 // so we must use ulw and usw
 #define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)           \
-    "ulw    %[temp0], "#A"(%["#P0"])        \n\t"       \
-    "ulw    %[temp1], "#B"(%["#P0"])        \n\t"       \
-    "ulw    %[temp2], "#C"(%["#P0"])        \n\t"       \
-    "ulw    %[temp3], "#D"(%["#P0"])        \n\t"       \
-    "ulw    %[temp4], "#A"(%["#P1"])        \n\t"       \
-    "ulw    %[temp5], "#B"(%["#P1"])        \n\t"       \
-    "ulw    %[temp6], "#C"(%["#P1"])        \n\t"       \
-    "ulw    %[temp7], "#D"(%["#P1"])        \n\t"       \
+    "ulw    %[temp0], " #A "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp1], " #B "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp2], " #C "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp3], " #D "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp4], " #A "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp5], " #B "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp6], " #C "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp7], " #D "(%[" #P1 "])    \n\t"       \
    "addu   %[temp4], %[temp4],   %[temp0]  \n\t"       \
    "addu   %[temp5], %[temp5],   %[temp1]  \n\t"       \
    "addu   %[temp6], %[temp6],   %[temp2]  \n\t"       \
    "addu   %[temp7], %[temp7],   %[temp3]  \n\t"       \
-    "addiu  %["#P0"],  %["#P0"],  16        \n\t"       \
-  ".if "#E" == 1                            \n\t"       \
-    "addiu  %["#P1"],  %["#P1"],  16        \n\t"       \
+    "addiu  %[" #P0 "],  %[" #P0 "],  16    \n\t"       \
+  ".if " #E " == 1                          \n\t"       \
+    "addiu  %[" #P1 "],  %[" #P1 "],  16    \n\t"       \
  ".endif                                   \n\t"       \
-    "usw    %[temp4], "#A"(%["#P2"])        \n\t"       \
-    "usw    %[temp5], "#B"(%["#P2"])        \n\t"       \
-    "usw    %[temp6], "#C"(%["#P2"])        \n\t"       \
-    "usw    %[temp7], "#D"(%["#P2"])        \n\t"       \
-    "addiu  %["#P2"], %["#P2"],   16        \n\t"       \
-    "bne    %["#P0"], %[LoopEnd], 1b        \n\t"       \
+    "usw    %[temp4], " #A "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp5], " #B "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp6], " #C "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp7], " #D "(%[" #P2 "])    \n\t"       \
+    "addiu  %[" #P2 "], %[" #P2 "],   16    \n\t"       \
+    "bne    %[" #P0 "], %[LoopEnd], 1b      \n\t"       \
    ".set   pop                             \n\t"       \

 #define ASM_END_COMMON_0                                \
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@ -259,20 +259,45 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
 //------------------------------------------------------------------------------
 // Subtract-Green Transform

-// vtbl? are unavailable in iOS/arm64 builds.
-#if !defined(__aarch64__)
+// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+// non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
+#define USE_VTBLQ
+#endif

-// 255 = byte will be zero'd
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+  1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+                                             const uint8x16_t shuffle) {
+  return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
+                     vtbl1q_u8(argb, vget_high_u8(shuffle)));
+}
+#else  // !USE_VTBLQ
+// 255 = byte will be zeroed
 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };

+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+                                             const uint8x8_t shuffle) {
+  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+                     vtbl1_u8(vget_high_u8(argb), shuffle));
+}
+#endif  // USE_VTBLQ
+
 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
-    const uint8x16_t greens =
-        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
-                    vtbl1_u8(vget_high_u8(argb), shuffle));
+    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
@ -281,19 +306,21 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {

 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
-    const uint8x16_t greens =
-        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
-                    vtbl1_u8(vget_high_u8(argb), shuffle));
+    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
 }

-#endif   // !__aarch64__
+#undef USE_VTBLQ

 #endif   // USE_INTRINSICS

@ -320,11 +347,9 @@ void VP8LDspInitNEON(void) {
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;

-#if !defined(__aarch64__)
  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
 #endif
-#endif

 #endif   // WEBP_USE_NEON
 }
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@ -189,7 +189,12 @@ const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
 extern void WebPInitUpsamplersSSE2(void);
 extern void WebPInitUpsamplersNEON(void);

+static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
+    (VP8CPUInfo)&upsampling_last_cpuinfo_used2;
+
 void WebPInitUpsamplers(void) {
+  if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
+
 #ifdef FANCY_UPSAMPLING
  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
@ -217,6 +222,7 @@ void WebPInitUpsamplers(void) {
 #endif
  }
 #endif  // FANCY_UPSAMPLING
+  upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
 }

 //------------------------------------------------------------------------------
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@ -123,7 +123,12 @@ WebPSamplerRowFunc WebPSamplers[MODE_LAST];
 extern void WebPInitSamplersSSE2(void);
 extern void WebPInitSamplersMIPS32(void);

+static volatile VP8CPUInfo yuv_last_cpuinfo_used =
+    (VP8CPUInfo)&yuv_last_cpuinfo_used;
+
 void WebPInitSamplers(void) {
+  if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  WebPSamplers[MODE_RGB]       = YuvToRgbRow;
  WebPSamplers[MODE_RGBA]      = YuvToRgbaRow;
  WebPSamplers[MODE_BGR]       = YuvToBgrRow;
@ -149,6 +154,7 @@ void WebPInitSamplers(void) {
    }
 #endif  // WEBP_USE_MIPS32
  }
+  yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }

 //-----------------------------------------------------------------------------
--- a/src/enc/histogram.c
+++ b/src/enc/histogram.c
@ -20,6 +20,9 @@
 #include "../dsp/lossless.h"
 #include "../utils/utils.h"

+#define ALIGN_CST 15
+#define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
+
 #define MAX_COST 1.e38

 // Number of partitions for the three dominant (literal, red and blue) symbol
@ -101,9 +104,9 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
 VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
  int i;
  VP8LHistogramSet* set;
-  const size_t total_size = sizeof(*set)
-                            + sizeof(*set->histograms) * size
-                            + (size_t)VP8LGetHistogramSize(cache_bits) * size;
+  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  const size_t total_size =
+      sizeof(*set) + size * (sizeof(*set->histograms) + histo_size + ALIGN_CST);
  uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
  if (memory == NULL) return NULL;

@ -114,12 +117,12 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
  set->max_size = size;
  set->size = size;
  for (i = 0; i < size; ++i) {
+    memory = (uint8_t*)DO_ALIGN(memory);
    set->histograms[i] = (VP8LHistogram*)memory;
    // literal_ won't necessary be aligned.
    set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
    VP8LHistogramInit(set->histograms[i], cache_bits);
-    // There's no padding/alignment between successive histograms.
-    memory += VP8LGetHistogramSize(cache_bits);
+    memory += histo_size;
  }
  return set;
 }
--- a/src/enc/picture_rescale.c
+++ b/src/enc/picture_rescale.c
@ -175,17 +175,13 @@ static void RescalePlane(const uint8_t* src,
                         int src_width, int src_height, int src_stride,
                         uint8_t* dst,
                         int dst_width, int dst_height, int dst_stride,
-                         int32_t* const work,
+                         rescaler_t* const work,
                         int num_channels) {
  WebPRescaler rescaler;
  int y = 0;
  WebPRescalerInit(&rescaler, src_width, src_height,
                   dst, dst_width, dst_height, dst_stride,
-                   num_channels,
-                   src_width, dst_width,
-                   src_height, dst_height,
-                   work);
-  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+                   num_channels, work);
  while (y < src_height) {
    y += WebPRescalerImport(&rescaler, src_height - y,
                            src + y * src_stride, src_stride);
@ -209,7 +205,7 @@ static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
 int WebPPictureRescale(WebPPicture* pic, int width, int height) {
  WebPPicture tmp;
  int prev_width, prev_height;
-  int32_t* work;
+  rescaler_t* work;

  if (pic == NULL) return 0;
  prev_width = pic->width;
@ -231,7 +227,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
  if (!WebPPictureAlloc(&tmp)) return 0;

  if (!pic->use_argb) {
-    work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
+    work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
    if (work == NULL) {
      WebPPictureFree(&tmp);
      return 0;
@ -259,7 +255,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
                 tmp.v,
                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
  } else {
-    work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
+    work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
    if (work == NULL) {
      WebPPictureFree(&tmp);
      return 0;
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@ -30,7 +30,7 @@ extern "C" {
 // version numbers
 #define ENC_MAJ_VERSION 0
 #define ENC_MIN_VERSION 4
-#define ENC_REV_VERSION 2
+#define ENC_REV_VERSION 4

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
--- a/src/enc/vp8l.c
+++ b/src/enc/vp8l.c
@ -1081,6 +1081,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
    int y;
    err = AllocateTransformBuffer(enc, width, height);
    if (err != VP8_ENC_OK) goto Error;
+    assert(enc->argb_ != NULL);
    for (y = 0; y < height; ++y) {
      memcpy(enc->argb_ + y * width,
             picture->argb + y * picture->argb_stride,
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@ -326,7 +326,7 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {

  if (!config->lossless) {
    VP8Encoder* enc = NULL;
-    if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
+    if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
      // Make sure we have YUVA samples.
      if (config->preprocessing & 4) {
 #if WEBP_ENCODER_ABI_VERSION > 0x0204
--- a/src/utils/endian_inl.h
+++ b/src/utils/endian_inl.h
@ -35,15 +35,15 @@
 #endif

 #if !defined(HAVE_CONFIG_H)
-// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
-#if LOCAL_GCC_PREREQ(4,3) || LOCAL_CLANG_PREREQ(3,3)
-#define HAVE_BUILTIN_BSWAP32
-#define HAVE_BUILTIN_BSWAP64
-#endif
-// clang-3.3 and gcc-4.8 have a builtin function for swap16
-#if LOCAL_GCC_PREREQ(4,8) || LOCAL_CLANG_PREREQ(3,3)
+#if LOCAL_GCC_PREREQ(4,8) || __has_builtin(__builtin_bswap16)
 #define HAVE_BUILTIN_BSWAP16
 #endif
+#if LOCAL_GCC_PREREQ(4,3) || __has_builtin(__builtin_bswap32)
+#define HAVE_BUILTIN_BSWAP32
+#endif
+#if LOCAL_GCC_PREREQ(4,3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
+#endif
 #endif  // !HAVE_CONFIG_H

 static WEBP_INLINE uint16_t BSwap16(uint16_t x) {
--- a/src/utils/rescaler.c
+++ b/src/utils/rescaler.c
@ -13,77 +13,192 @@

 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
 #include "./rescaler.h"
 #include "../dsp/dsp.h"

 //------------------------------------------------------------------------------
 // Implementations of critical functions ImportRow / ExportRow

-void (*WebPRescalerImportRow)(WebPRescaler* const wrk,
-                              const uint8_t* const src, int channel) = NULL;
-void (*WebPRescalerExportRow)(WebPRescaler* const wrk, int x_out) = NULL;
+// Import a row of data and save its contribution in the rescaler.
+// All channels of the row are imported in a single call. 'Expand' corresponds
+// to the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
+typedef void (*WebPRescalerImportRowFunc)(WebPRescaler* const wrk,
+                                          const uint8_t* src);
+static WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
+static WebPRescalerImportRowFunc WebPRescalerImportRowShrink;

-#define RFIX 30
-#define MULT_FIX(x, y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
+// Export one complete row (all channels) from the rescaler.
+// 'Expand' corresponds to the wrk->y_expand case.
+// Otherwise, 'Shrink' is to be used.
+typedef void (*WebPRescalerExportRowFunc)(WebPRescaler* const wrk);
+static WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
+static WebPRescalerExportRowFunc WebPRescalerExportRowShrink;

-static void ImportRowC(WebPRescaler* const wrk,
-                       const uint8_t* const src, int channel) {
+#define WEBP_RESCALER_RFIX 32   // fixed-point precision for multiplies
+#define WEBP_RESCALER_ONE (1ull << WEBP_RESCALER_RFIX)
+#define WEBP_RESCALER_FRAC(x, y) \
+    ((uint32_t)(((uint64_t)(x) << WEBP_RESCALER_RFIX) / (y)))
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+
+static void ImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
-  int x_in = channel;
-  int x_out;
-  int accum = 0;
-  if (!wrk->x_expand) {
-    int sum = 0;
-    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-      accum += wrk->x_add;
-      for (; accum > 0; accum -= wrk->x_sub) {
-        sum += src[x_in];
-        x_in += x_stride;
-      }
-      {        // Emit next horizontal pixel.
-        const int32_t base = src[x_in];
-        const int32_t frac = base * (-accum);
-        x_in += x_stride;
-        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
-        // fresh fractional start for next pixel
-        sum = (int)MULT_FIX(frac, wrk->fx_scale);
-      }
-    }
-  } else {        // simple bilinear interpolation
-    int left = src[channel], right = src[channel];
-    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
+  int channel;
+  assert(!WebPRescalerInputDone(wrk));
+  assert(wrk->x_expand);
+  for (channel = 0; channel < x_stride; ++channel) {
+    int x_in = channel;
+    int x_out = channel;
+    // simple bilinear interpolation
+    int accum = wrk->x_add;
+    int left = src[x_in];
+    int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+    x_in += x_stride;
+    while (1) {
+      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
+      x_out += x_stride;
+      if (x_out >= x_out_max) break;
+      accum -= wrk->x_sub;
      if (accum < 0) {
        left = right;
        x_in += x_stride;
+        assert(x_in < wrk->src_width * x_stride);
        right = src[x_in];
        accum += wrk->x_add;
      }
-      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
-      accum -= wrk->x_sub;
    }
-  }
-  // Accumulate the contribution of the new row.
-  for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-    wrk->irow[x_out] += wrk->frow[x_out];
+    assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
  }
 }

-static void ExportRowC(WebPRescaler* const wrk, int x_out) {
-  if (wrk->y_accum <= 0) {
-    uint8_t* const dst = wrk->dst;
-    int32_t* const irow = wrk->irow;
-    const int32_t* const frow = wrk->frow;
-    const int yscale = wrk->fy_scale * (-wrk->y_accum);
-    const int x_out_max = wrk->dst_width * wrk->num_channels;
-    for (; x_out < x_out_max; ++x_out) {
-      const int frac = (int)MULT_FIX(frow[x_out], yscale);
+static void ImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  int channel;
+  assert(!WebPRescalerInputDone(wrk));
+  assert(!wrk->x_expand);
+  for (channel = 0; channel < x_stride; ++channel) {
+    int x_in = channel;
+    int x_out = channel;
+    uint32_t sum = 0;
+    int accum = 0;
+    while (x_out < x_out_max) {
+      uint32_t base = 0;
+      accum += wrk->x_add;
+      while (accum > 0) {
+        accum -= wrk->x_sub;
+        assert(x_in < wrk->src_width * x_stride);
+        base = src[x_in];
+        sum += base;
+        x_in += x_stride;
+      }
+      {        // Emit next horizontal pixel.
+        const rescaler_t frac = base * (-accum);
+        wrk->frow[x_out] = sum * wrk->x_sub - frac;
+        // fresh fractional start for next pixel
+        sum = (int)MULT_FIX(frac, wrk->fx_scale);
+      }
+      x_out += x_stride;
+    }
+    assert(accum == 0);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowExpandC(WebPRescaler* const wrk) {
+  int x_out;
+  uint8_t* const dst = wrk->dst;
+  rescaler_t* const irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* const frow = wrk->frow;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum == 0) {
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const uint32_t J = frow[x_out];
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+    }
+  } else {
+    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const uint64_t I = (uint64_t)A * frow[x_out]
+                       + (uint64_t)B * irow[x_out];
+      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+    }
+  }
+}
+
+static void ExportRowShrinkC(WebPRescaler* const wrk) {
+  int x_out;
+  uint8_t* const dst = wrk->dst;
+  rescaler_t* const irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* const frow = wrk->frow;
+  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  if (yscale) {
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
-      dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
      irow[x_out] = frac;   // new fractional start
    }
+  } else {
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+      irow[x_out] = 0;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Main entry calls
+
+void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
+  assert(!WebPRescalerInputDone(wrk));
+  if (!wrk->x_expand) {
+    WebPRescalerImportRowShrink(wrk, src);
+  } else {
+    WebPRescalerImportRowExpand(wrk, src);
+  }
+}
+
+void WebPRescalerExportRow(WebPRescaler* const wrk) {
+  if (wrk->y_accum <= 0) {
+    assert(!WebPRescalerOutputDone(wrk));
+    if (wrk->y_expand) {
+      WebPRescalerExportRowExpand(wrk);
+    } else if (wrk->fxy_scale) {
+      WebPRescalerExportRowShrink(wrk);
+    } else {  // very special case for src = dst = 1x1
+      int i;
+      assert(wrk->src_width == 1 && wrk->dst_width <= 2);
+      assert(wrk->src_height == 1 && wrk->dst_height == 1);
+      for (i = 0; i < wrk->num_channels * wrk->dst_width; ++i) {
+        wrk->dst[i] = wrk->irow[i];
+        wrk->irow[i] = 0;
+      }
+    }
    wrk->y_accum += wrk->y_add;
    wrk->dst += wrk->dst_stride;
+    ++wrk->dst_y;
  }
 }

@ -92,23 +207,25 @@ static void ExportRowC(WebPRescaler* const wrk, int x_out) {

 #if defined(WEBP_USE_MIPS32)

-static void ImportRowMIPS(WebPRescaler* const wrk,
-                          const uint8_t* const src, int channel) {
+// MIPS32 inline-assembly version of the horizontally-shrinking row import.
+// Channels are processed one at a time: for each channel the loop walks the
+// source samples (stepping by num_channels bytes) and stores one 32-bit
+// result per output pixel into wrk->frow (stepping by num_channels entries).
+// wrk->irow is not touched here.
+static void ImportRowShrinkMIPS(WebPRescaler* const wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const int fx_scale = wrk->fx_scale;
  const int x_add = wrk->x_add;
  const int x_sub = wrk->x_sub;
-  int* frow = wrk->frow + channel;
-  int* irow = wrk->irow + channel;
-  const uint8_t* src1 = src + channel;
-  int temp1, temp2, temp3;
-  int base, frac, sum;
-  int accum, accum1;
  const int x_stride1 = x_stride << 2;
-  int loop_c = x_out_max - channel;
+  int channel;
+  assert(!wrk->x_expand);
+  assert(!WebPRescalerInputDone(wrk));
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    rescaler_t* frow = wrk->frow + channel;
+    int temp1, temp2, temp3;
+    int base, frac, sum;
+    int accum, accum1;
+    int loop_c = x_out_max - channel;

-  if (!wrk->x_expand) {
    __asm__ volatile (
      "li     %[temp1],   0x8000                    \n\t"
      "li     %[temp2],   0x10000                   \n\t"
@ -116,179 +233,295 @ static void ImportRowMIPS(WebPRescaler* const wrk,
      "li     %[accum],   0                         \n\t"
    "1:                                             \n\t"
      "addu   %[accum],   %[accum],   %[x_add]      \n\t"
+      "li     %[base],    0                         \n\t"
      "blez   %[accum],   3f                        \n\t"
    "2:                                             \n\t"
-      "lbu    %[temp3],   0(%[src1])                \n\t"
+      "lbu    %[base],    0(%[src1])                \n\t"
      "subu   %[accum],   %[accum],   %[x_sub]      \n\t"
      "addu   %[src1],    %[src1],    %[x_stride]   \n\t"
-      "addu   %[sum],     %[sum],     %[temp3]      \n\t"
+      "addu   %[sum],     %[sum],     %[base]       \n\t"
      "bgtz   %[accum],   2b                        \n\t"
    "3:                                             \n\t"
-      "lbu    %[base],    0(%[src1])                \n\t"
-      "addu   %[src1],    %[src1],    %[x_stride]   \n\t"
      "negu   %[accum1],  %[accum]                  \n\t"
      "mul    %[frac],    %[base],    %[accum1]     \n\t"
-      "addu   %[temp3],   %[sum],     %[base]       \n\t"
-      "mul    %[temp3],   %[temp3],   %[x_sub]      \n\t"
-      "lw     %[base],    0(%[irow])                \n\t"
+      "mul    %[temp3],   %[sum],     %[x_sub]      \n\t"
      "subu   %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
-      "sll    %[accum1],  %[frac],    2             \n\t"
      "mult   %[temp1],   %[temp2]                  \n\t"
-      "madd   %[accum1],  %[fx_scale]               \n\t"
+      "maddu  %[frac],    %[fx_scale]               \n\t"
      "mfhi   %[sum]                                \n\t"
      "subu   %[temp3],   %[temp3],   %[frac]       \n\t"
      "sw     %[temp3],   0(%[frow])                \n\t"
-      "add    %[base],    %[base],    %[temp3]      \n\t"
-      "sw     %[base],    0(%[irow])                \n\t"
-      "addu   %[irow],    %[irow],    %[x_stride1]  \n\t"
      "addu   %[frow],    %[frow],    %[x_stride1]  \n\t"
      "bgtz   %[loop_c],  1b                        \n\t"
+      // 'loop_c' is decremented inside the loop, so it has to be declared as
+      // a read-write ("+r") operand: an input-only operand must not be
+      // modified, otherwise the compiler may reuse its register assuming the
+      // value is still intact after the asm statement.
+      : [accum]"=&r"(accum), [src1]"+r"(src1), [temp3]"=&r"(temp3),
+        [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
+        [frow]"+r"(frow), [accum1]"=&r"(accum1),
+        [temp2]"=&r"(temp2), [temp1]"=&r"(temp1),
+        [loop_c]"+r"(loop_c)
+      : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale),
+        [x_sub]"r"(x_sub), [x_add]"r"(x_add),
+        [x_stride1]"r"(x_stride1)
+      : "memory", "hi", "lo"
+    );
+    assert(accum == 0);
+  }
+}

-      : [accum] "=&r" (accum), [src1] "+r" (src1), [temp3] "=&r" (temp3),
-        [sum] "=&r" (sum), [base] "=&r" (base), [frac] "=&r" (frac),
-        [frow] "+r" (frow), [irow] "+r" (irow), [accum1] "=&r" (accum1),
-        [temp2] "=&r" (temp2), [temp1] "=&r" (temp1)
-      : [x_stride] "r" (x_stride), [fx_scale] "r" (fx_scale),
-        [x_sub] "r" (x_sub), [x_add] "r" (x_add),
-        [loop_c] "r" (loop_c), [x_stride1] "r" (x_stride1)
+// MIPS32 inline-assembly version of the horizontally-expanding row import.
+// Channels are processed one at a time. For each channel the loop keeps two
+// neighbouring source samples (temp2 = left, temp1 = right; both equal when
+// src_width is 1) and stores
+//     right * x_add + (left - right) * accum
+// into wrk->frow for every output pixel. 'accum' starts at x_add and drops by
+// x_sub per output pixel; whenever it becomes negative the next source sample
+// is fetched and x_add is added back.
+static void ImportRowExpandMIPS(WebPRescaler* const wrk, const uint8_t* src) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const int x_add = wrk->x_add;
+  const int x_sub = wrk->x_sub;
+  const int src_width = wrk->src_width;
+  const int x_stride1 = x_stride << 2;
+  int channel;
+  assert(wrk->x_expand);
+  assert(!WebPRescalerInputDone(wrk));
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    rescaler_t* frow = wrk->frow + channel;
+    int temp1, temp2, temp3, temp4;
+    // NOTE(review): 'frac' is listed as an asm output below but no
+    // instruction references it.
+    int frac;
+    int accum;
+    int x_out = channel;
+
+    __asm__ volatile (
+      "addiu  %[temp3],   %[src_width], -1            \n\t"
+      "lbu    %[temp2],   0(%[src1])                  \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "bgtz   %[temp3],   0f                          \n\t"
+      "addiu  %[temp1],   %[temp2],     0             \n\t"
+      "b      3f                                      \n\t"
+    "0:                                               \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+    "3:                                               \n\t"
+      "addiu  %[accum],   %[x_add],     0             \n\t"
+    "1:                                               \n\t"
+      "subu   %[temp3],   %[temp2],     %[temp1]      \n\t"
+      "mul    %[temp3],   %[temp3],     %[accum]      \n\t"
+      "mul    %[temp4],   %[temp1],     %[x_add]      \n\t"
+      "addu   %[temp3],   %[temp4],     %[temp3]      \n\t"
+      "sw     %[temp3],   0(%[frow])                  \n\t"
+      "addu   %[frow],    %[frow],      %[x_stride1]  \n\t"
+      "addu   %[x_out],   %[x_out],     %[x_stride]   \n\t"
+      "subu   %[temp3],   %[x_out],     %[x_out_max]  \n\t"
+      "bgez   %[temp3],   2f                          \n\t"
+      "subu   %[accum],   %[accum],     %[x_sub]      \n\t"
+      "bgez   %[accum],   4f                          \n\t"
+      "addiu  %[temp2],   %[temp1],     0             \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+      "addu   %[accum],   %[accum],     %[x_add]      \n\t"
+    "4:                                               \n\t"
+      "b      1b                                      \n\t"
+    "2:                                               \n\t"
+      : [src1]"+r"(src1), [accum]"=&r"(accum), [temp1]"=&r"(temp1),
+        [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+        [x_out]"+r"(x_out), [frac]"=&r"(frac), [frow]"+r"(frow)
+      : [x_stride]"r"(x_stride), [x_add]"r"(x_add), [x_sub]"r"(x_sub),
+        [x_stride1]"r"(x_stride1), [src_width]"r"(src_width),
+        [x_out_max]"r"(x_out_max)
+      : "memory", "hi", "lo"
+    );
+    assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+// MIPS32 inline-assembly version of the vertically-expanding row export.
+// Writes x_out_max bytes starting at wrk->dst. Before every multiply-add the
+// HI/LO pair is preloaded with 0x10000 * 0x8000 (= 1 << 31) and only the HI
+// word is kept afterwards, i.e. each product is computed as
+// (product + (1 << 31)) >> 32.
+//  - y_accum == 0: dst[x] is derived from frow[x] * fy_scale.
+//  - otherwise:    frow[x] and irow[x] are first blended as
+//                  A * frow[x] + B * irow[x], with
+//                  B = WEBP_RESCALER_FRAC(-y_accum, y_sub) and
+//                  A = WEBP_RESCALER_ONE - B, then multiplied by fy_scale.
+// Only local copies of the dst/irow/frow pointers are advanced; the fields of
+// 'wrk' are left unchanged by this function.
+static void ExportRowExpandMIPS(WebPRescaler* const wrk) {
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* frow = wrk->frow;
+  int temp0, temp1, temp3, temp4, temp5, loop_end;
+  const int temp2 = (int)wrk->fy_scale;
+  // byte size of one row of rescaler_t values (4 bytes each)
+  const int temp6 = x_out_max << 2;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum == 0) {
+    // NOTE(review): 'temp1' is declared as an output of this asm statement
+    // but is not referenced by any of its instructions.
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6)
+      : "memory", "hi", "lo"
+    );
+  } else {
+    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "lw       %[temp1],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[A],        %[temp0]                   \n\t"
+      "maddu    %[B],        %[temp1]                   \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp5],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+      : "memory", "hi", "lo"
+    );
+  }
+}
+
+// MIPS32 inline-assembly version of the vertically-shrinking row export.
+// Writes x_out_max bytes starting at wrk->dst and updates wrk->irow in place.
+// Every multiply-add starts from HI/LO = 0x10000 * 0x8000 (= 1 << 31) and
+// keeps the HI word only, i.e. computes (product + (1 << 31)) >> 32.
+//  - yscale != 0: frac    = frow[x] * yscale            (rounded as above)
+//                 dst[x]  = (irow[x] - frac) * fxy_scale (rounded as above)
+//                 irow[x] = frac
+//  - yscale == 0: dst[x]  = irow[x] * fxy_scale          (rounded as above)
+//                 irow[x] = 0
+// Requires fxy_scale != 0 (see the assert below).
+static void ExportRowShrinkMIPS(WebPRescaler* const wrk) {
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const rescaler_t* frow = wrk->frow;
+  const int yscale = wrk->fy_scale * (-wrk->y_accum);
+  int temp0, temp1, temp3, temp4, temp5, loop_end;
+  const int temp2 = (int)wrk->fxy_scale;
+  // byte size of one row of rescaler_t values (4 bytes each)
+  const int temp6 = x_out_max << 2;
+
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  assert(wrk->fxy_scale != 0);
+  if (yscale) {
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "maddu    %[temp0],    %[yscale]                  \n\t"
+      "mfhi     %[temp1]                                \n\t"
+      "lw       %[temp0],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "subu     %[temp0],    %[temp0],    %[temp1]      \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sw       %[temp1],    -4(%[irow])                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp6]"r"(temp6)
      : "memory", "hi", "lo"
    );
  } else {
    __asm__ volatile (
-      "lbu    %[temp1],   0(%[src1])                \n\t"
-      "move   %[temp2],   %[temp1]                  \n\t"
-      "li     %[accum],   0                         \n\t"
-    "1:                                             \n\t"
-      "bgez   %[accum],   2f                        \n\t"
-      "move   %[temp2],   %[temp1]                  \n\t"
-      "addu   %[src1],    %[x_stride]               \n\t"
-      "lbu    %[temp1],   0(%[src1])                \n\t"
-      "addu   %[accum],   %[x_add]                  \n\t"
-    "2:                                             \n\t"
-      "subu   %[temp3],   %[temp2],   %[temp1]      \n\t"
-      "mul    %[temp3],   %[temp3],   %[accum]      \n\t"
-      "mul    %[base],    %[temp1],   %[x_add]      \n\t"
-      "subu   %[accum],   %[accum],   %[x_sub]      \n\t"
-      "lw     %[frac],    0(%[irow])                \n\t"
-      "subu   %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
-      "addu   %[temp3],   %[base],    %[temp3]      \n\t"
-      "sw     %[temp3],   0(%[frow])                \n\t"
-      "addu   %[frow],    %[x_stride1]              \n\t"
-      "addu   %[frac],    %[temp3]                  \n\t"
-      "sw     %[frac],    0(%[irow])                \n\t"
-      "addu   %[irow],    %[x_stride1]              \n\t"
-      "bgtz   %[loop_c],  1b                        \n\t"
-
-      : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1),
-        [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [base] "=&r" (base),
-        [frac] "=&r" (frac), [frow] "+r" (frow), [irow] "+r" (irow)
-      : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub),
-        [x_stride1] "r" (x_stride1), [loop_c] "r" (loop_c)
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[irow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sw       $zero,       -4(%[irow])                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[irow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+        [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6)
      : "memory", "hi", "lo"
    );
  }
 }

-static void ExportRowMIPS(WebPRescaler* const wrk, int x_out) {
-  if (wrk->y_accum <= 0) {
-    uint8_t* const dst = wrk->dst;
-    int32_t* const irow = wrk->irow;
-    const int32_t* const frow = wrk->frow;
-    const int yscale = wrk->fy_scale * (-wrk->y_accum);
-    const int x_out_max = wrk->dst_width * wrk->num_channels;
-    // if wrk->fxy_scale can fit into 32 bits use optimized code,
-    // otherwise use C code
-    if ((wrk->fxy_scale >> 32) == 0) {
-      int temp0, temp1, temp3, temp4, temp5, temp6, temp7, loop_end;
-      const int temp2 = (int)(wrk->fxy_scale);
-      const int temp8 = x_out_max << 2;
-      uint8_t* dst_t = (uint8_t*)dst;
-      int32_t* irow_t = (int32_t*)irow;
-      const int32_t* frow_t = (const int32_t*)frow;
-
-      __asm__ volatile(
-        "addiu    %[temp6],    $zero,       -256          \n\t"
-        "addiu    %[temp7],    $zero,       255           \n\t"
-        "li       %[temp3],    0x10000                    \n\t"
-        "li       %[temp4],    0x8000                     \n\t"
-        "addu     %[loop_end], %[frow_t],   %[temp8]      \n\t"
-      "1:                                                 \n\t"
-        "lw       %[temp0],    0(%[frow_t])               \n\t"
-        "mult     %[temp3],    %[temp4]                   \n\t"
-        "addiu    %[frow_t],   %[frow_t],   4             \n\t"
-        "sll      %[temp0],    %[temp0],    2             \n\t"
-        "madd     %[temp0],    %[yscale]                  \n\t"
-        "mfhi     %[temp1]                                \n\t"
-        "lw       %[temp0],    0(%[irow_t])               \n\t"
-        "addiu    %[dst_t],    %[dst_t],    1             \n\t"
-        "addiu    %[irow_t],   %[irow_t],   4             \n\t"
-        "subu     %[temp0],    %[temp0],    %[temp1]      \n\t"
-        "mult     %[temp3],    %[temp4]                   \n\t"
-        "sll      %[temp0],    %[temp0],    2             \n\t"
-        "madd     %[temp0],    %[temp2]                   \n\t"
-        "mfhi     %[temp5]                                \n\t"
-        "sw       %[temp1],    -4(%[irow_t])              \n\t"
-        "and      %[temp0],    %[temp5],    %[temp6]      \n\t"
-        "slti     %[temp1],    %[temp5],    0             \n\t"
-        "beqz     %[temp0],    2f                         \n\t"
-        "xor      %[temp5],    %[temp5],    %[temp5]      \n\t"
-        "movz     %[temp5],    %[temp7],    %[temp1]      \n\t"
-      "2:                                                 \n\t"
-        "sb       %[temp5],    -1(%[dst_t])               \n\t"
-        "bne      %[frow_t],   %[loop_end], 1b            \n\t"
-
-        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
-          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
-          [temp7]"=&r"(temp7), [frow_t]"+r"(frow_t), [irow_t]"+r"(irow_t),
-          [dst_t]"+r"(dst_t), [loop_end]"=&r"(loop_end)
-        : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp8]"r"(temp8)
-        : "memory", "hi", "lo"
-      );
-      wrk->y_accum += wrk->y_add;
-      wrk->dst += wrk->dst_stride;
-    } else {
-      ExportRowC(wrk, x_out);
-    }
-  }
-}
 #endif   // WEBP_USE_MIPS32

 //------------------------------------------------------------------------------

 void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
-                      uint8_t* const dst, int dst_width, int dst_height,
-                      int dst_stride, int num_channels, int x_add, int x_sub,
-                      int y_add, int y_sub, int32_t* const work) {
+                      uint8_t* const dst,
+                      int dst_width, int dst_height, int dst_stride,
+                      int num_channels, rescaler_t* const work) {
+  // Sets up 'wrk' for rescaling a src_width x src_height source into the
+  // dst_width x dst_height area at 'dst' (rows 'dst_stride' bytes apart).
+  // 'work' must hold 2 * dst_width * num_channels entries: the first half
+  // becomes 'irow', the second half 'frow', and the whole area is zeroed.
+  // The horizontal/vertical increments are derived from the dimensions.
+  // On the first call this also installs the row import/export routines
+  // (the MIPS32 ones when VP8GetCPUInfo(kMIPS32) reports support).
+  const int x_add = src_width, x_sub = dst_width;
+  const int y_add = src_height, y_sub = dst_height;
  wrk->x_expand = (src_width < dst_width);
+  wrk->y_expand = (src_height < dst_height);
  wrk->src_width = src_width;
  wrk->src_height = src_height;
  wrk->dst_width = dst_width;
  wrk->dst_height = dst_height;
+  wrk->src_y = 0;
+  wrk->dst_y = 0;
  wrk->dst = dst;
  wrk->dst_stride = dst_stride;
  wrk->num_channels = num_channels;
+
  // for 'x_expand', we use bilinear interpolation
-  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
+  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
-  wrk->y_accum = y_add;
-  wrk->y_add = y_add;
-  wrk->y_sub = y_sub;
-  wrk->fx_scale = (1 << RFIX) / x_sub;
-  wrk->fy_scale = (1 << RFIX) / y_sub;
-  wrk->fxy_scale = wrk->x_expand ?
-      ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
-      ((int64_t)dst_height << RFIX) / (x_add * src_height);
+  if (!wrk->x_expand) {  // fx_scale is not used otherwise
+    wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
+  }
+  // vertical scaling parameters
+  wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
+  wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
+  wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
+  if (!wrk->y_expand) {
+    // this is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+    // The denominator is widened to 64 bits *before* multiplying: the product
+    // of two 'int' dimensions can exceed INT_MAX for large pictures.
+    const uint64_t ratio = (uint64_t)dst_height * WEBP_RESCALER_ONE /
+                           ((uint64_t)wrk->x_add * wrk->y_add);
+    if (ratio != (uint32_t)ratio) {
+      // We can't represent the ratio with the current fixed-point precision.
+      // => We special-case fxy_scale = 0, in WebPRescalerExportRow().
+      wrk->fxy_scale = 0;
+    } else {
+      wrk->fxy_scale = (uint32_t)ratio;
+    }
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
+  } else {
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
+    // wrk->fxy_scale is unused here.
+  }
  wrk->irow = work;
  wrk->frow = work + num_channels * dst_width;
+  // Compute the byte count in size_t from the start, so that the element
+  // count is not evaluated (and possibly overflowed) in 'int' first.
+  memset(work, 0, (size_t)2 * dst_width * num_channels * sizeof(*work));

-  if (WebPRescalerImportRow == NULL) {
-    WebPRescalerImportRow = ImportRowC;
-    WebPRescalerExportRow = ExportRowC;
+  if (WebPRescalerImportRowExpand == NULL) {
+    WebPRescalerImportRowExpand = ImportRowExpandC;
+    WebPRescalerImportRowShrink = ImportRowShrinkC;
+    WebPRescalerExportRowExpand = ExportRowExpandC;
+    WebPRescalerExportRowShrink = ExportRowShrinkC;
    if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_MIPS32)
      if (VP8GetCPUInfo(kMIPS32)) {
-        WebPRescalerImportRow = ImportRowMIPS;
-        WebPRescalerExportRow = ExportRowMIPS;
+        WebPRescalerImportRowExpand = ImportRowExpandMIPS;
+        WebPRescalerImportRowShrink = ImportRowShrinkMIPS;
+        WebPRescalerExportRowExpand = ExportRowExpandMIPS;
+        WebPRescalerExportRowShrink = ExportRowShrinkMIPS;
      }
 #endif
    }
@ -296,7 +529,10 @ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
 }

 #undef MULT_FIX
-#undef RFIX
+#undef WEBP_RESCALER_RFIX
+#undef WEBP_RESCALER_ONE
+#undef WEBP_RESCALER_FRAC
+#undef ROUNDER

 //------------------------------------------------------------------------------
 // all-in-one calls
@ -309,11 +545,20 @@ int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
 int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
                       const uint8_t* src, int src_stride) {
  int total_imported = 0;
-  while (total_imported < num_lines && wrk->y_accum > 0) {
-    int channel;
-    for (channel = 0; channel < wrk->num_channels; ++channel) {
-      WebPRescalerImportRow(wrk, src, channel);
+  while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
+    if (wrk->y_expand) {
+      rescaler_t* const tmp = wrk->irow;
+      wrk->irow = wrk->frow;
+      wrk->frow = tmp;
    }
+    WebPRescalerImportRow(wrk, src);
+    if (!wrk->y_expand) {     // Accumulate the contribution of the new row.
+      int x;
+      for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
+        wrk->irow[x] += wrk->frow[x];
+      }
+    }
+    ++wrk->src_y;
    src += src_stride;
    ++total_imported;
    wrk->y_accum -= wrk->y_sub;
@ -324,7 +569,7 @@ int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
 int WebPRescalerExport(WebPRescaler* const rescaler) {
  int total_exported = 0;
  while (WebPRescalerHasPendingOutput(rescaler)) {
-    WebPRescalerExportRow(rescaler, 0);
+    WebPRescalerExportRow(rescaler);
    ++total_exported;
  }
  return total_exported;
--- a/src/utils/rescaler.h
+++ b/src/utils/rescaler.h
@ -21,20 +21,23 @@ extern "C" {
 #include "../webp/types.h"

 // Structure used for on-the-fly rescaling
+typedef uint32_t rescaler_t;   // type for side-buffer
 typedef struct {
  int x_expand;               // true if we're expanding in the x direction
+  int y_expand;               // true if we're expanding in the y direction
  int num_channels;           // bytes to jump between pixels
-  int fy_scale, fx_scale;     // fixed-point scaling factor
-  int64_t fxy_scale;          // ''
-  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
+  uint32_t fx_scale;          // fixed-point scaling factors
+  uint32_t fy_scale;          // ''
+  uint32_t fxy_scale;         // '' (0 selects the unscaled-copy export path)
  int y_accum;                // vertical accumulator
-  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
-  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
+  int y_add, y_sub;           // vertical increments
+  int x_add, x_sub;           // horizontal increments
  int src_width, src_height;  // source dimensions
  int dst_width, dst_height;  // destination dimensions
+  int src_y, dst_y;           // row counters for input and output
  uint8_t* dst;               // where the next exported row is written
  int dst_stride;             // added to 'dst' after each exported row
-  int32_t* irow, *frow;       // work buffer
+  rescaler_t* irow, *frow;    // work buffer
 } WebPRescaler;

 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
@ -43,9 +46,7 @@ void WebPRescalerInit(WebPRescaler* const rescaler,
                      uint8_t* const dst,
                      int dst_width, int dst_height, int dst_stride,
                      int num_channels,
-                      int x_add, int x_sub,
-                      int y_add, int y_sub,
-                      int32_t* const work);
+                      rescaler_t* const work);

 // Returns the number of input lines needed next to produce one output line,
 // considering that the maximum available input lines are 'max_num_lines'.
@ -57,21 +58,29 @@ int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
 int WebPRescalerImport(WebPRescaler* const rescaler, int num_rows,
                       const uint8_t* src, int src_stride);

-// Import a row of data and save its contribution in the rescaler.
-// 'channel' denotes the channel number to be imported.
-extern void (*WebPRescalerImportRow)(WebPRescaler* const wrk,
-                                     const uint8_t* const src, int channel);
-// Export one row (starting at x_out position) from rescaler.
-extern void (*WebPRescalerExportRow)(WebPRescaler* const wrk, int x_out);
-
-// Return true if there is pending output rows ready.
-static WEBP_INLINE
-int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
-  return (rescaler->y_accum <= 0);
-}
-
 // Export as many rows as possible. Return the numbers of rows written.
 int WebPRescalerExport(WebPRescaler* const rescaler);
+void WebPRescalerImportRow(WebPRescaler* const wrk,
+                           const uint8_t* src);
+// Export one row from the rescaler, if an output row is pending.
+void WebPRescalerExportRow(WebPRescaler* const wrk);
+
+// Returns true once the input row counter has reached the source height.
+static WEBP_INLINE
+int WebPRescalerInputDone(const WebPRescaler* const rescaler) {
+  return (rescaler->src_height <= rescaler->src_y);
+}
+// Returns true once the output row counter has reached the destination
+// height.
+static WEBP_INLINE
+int WebPRescalerOutputDone(const WebPRescaler* const rescaler) {
+  return (rescaler->dst_height <= rescaler->dst_y);
+}
+
+// Returns true when an output row can be exported right away: the output is
+// not finished yet and the vertical accumulator is <= 0.
+static WEBP_INLINE
+int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
+  if (WebPRescalerOutputDone(rescaler)) return 0;
+  return (rescaler->y_accum <= 0);
+}

 //------------------------------------------------------------------------------

--- a/src/utils/utils.h
+++ b/src/utils/utils.h
@ -90,7 +90,7 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
 #pragma intrinsic(_BitScanReverse)

+// Returns the index of the highest set bit of 'n', as reported by the
+// _BitScanReverse() intrinsic.
+// NOTE(review): the intrinsic leaves the index unspecified when n == 0;
+// callers presumably never pass 0 -- confirm.
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  uint32_t first_set_bit;
+  // _BitScanReverse() writes its result through an 'unsigned long*'.
+  unsigned long first_set_bit;
  _BitScanReverse(&first_set_bit, n);
  return first_set_bit;
 }
--- a/src/webp/encode.h
+++ b/src/webp/encode.h
@ -419,7 +419,9 @@ WEBP_EXTERN(int) WebPPictureView(const WebPPicture* src,
 WEBP_EXTERN(int) WebPPictureIsView(const WebPPicture* picture);

 // Rescale a picture to new dimension width x height.
-// Now gamma correction is applied.
+// If either 'width' or 'height' (but not both) is 0 the corresponding
+// dimension will be calculated preserving the aspect ratio.
+// No gamma correction is applied.
 // Returns false in case of error (invalid parameter or insufficient memory).
 WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* pic, int width, int height);

--- a/src/webp/types.h
+++ b/src/webp/types.h
@ -18,10 +18,11 @@

 #ifndef _MSC_VER
 #include <inttypes.h>
-#ifdef __STRICT_ANSI__
-#define WEBP_INLINE
-#else  /* __STRICT_ANSI__ */
+#if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
 #define WEBP_INLINE inline
+#else
+#define WEBP_INLINE
 #endif
 #else
 typedef signed   char int8_t;
Author	SHA1	Message	Date
Pascal Massimino	46e18c0a25	vwebp: fix incorrect clipping w/NO_BLEND when the previous frame does not specify dispose to background only the current frame's rectangle should be cleared related to bug #245 (cherry picked from commit `469ba2cdfd`) Change-Id: I2fc4f5be99057e0bf87d8fedec57b06859b070bd	2015-10-23 13:10:17 -07:00
James Zern	fcfde90b9c	update issue tracker url code.google.com -> bugs.chromium.org (cherry picked from commit `4b9186b2eb`) Change-Id: I0dc99a85c29657415401160df3c7dd0423f96457	2015-10-20 22:44:52 -07:00
James Zern	8c3fb330e5	update AUTHORS missed in 0.4.3 Change-Id: I2b488307dfec592264467847adcbc3de0a6c83f3	2015-10-19 15:41:24 -07:00
James Zern	808d4a686e	update NEWS Change-Id: I301be57ed6d925164f3827a79c6215ab79f120c7	2015-10-19 15:41:24 -07:00
James Zern	62864042c0	bump version to 0.4.4 libwebp{,decoder} - 0.4.4 libwebp libtool - 5.4.0 libwebpdecoder libtool - 1.4.0 mux/demux - 0.2.2 (unchanged) libtool - 1.2.0 (unchanged) Change-Id: I7d421dc47ad4d25a17450ce1b04562c5d58c596b	2015-10-19 15:41:23 -07:00
James Zern	b8b314ab39	doc/webp-container-spec: update repo browser link gerrit.chromium.org is deprecated, use chromium.googlesource.com. (cherry picked from commit `f0486968ba`) Change-Id: Iaa6d6d18798dbd8cce908988287387f5cb8e8e64	2015-10-19 15:41:23 -07:00
Pascal Massimino	c3953e37c9	fix typo: constitutes -> constitute (cherry picked from commit `5fe1fe37a5`) Change-Id: I5b20ef41f4a810e11a4499b46b5e7dc93247beed	2015-10-19 15:41:23 -07:00
Johann	cd377e291c	Use __has_builtin to check clang support Older versions of Xcode with clang reporting versions 4.[012] and 5.0 did not include support for __builtin_bswap16. Checking in this manner avoids using brittle version checks. Matches a change to libvpx: https://chromium-review.googlesource.com/305573 to fix: https://code.google.com/p/webm/issues/detail?id=1082 (cherry picked from commit `d26d9def80`) Change-Id: I23ea466ee1b53b12cd3fb45f65a2186c8dda95a1	2015-10-19 15:41:22 -07:00
James Zern	e2e89806f7	wicdec: fix alpha detection w/64bpp BGRA/RGBA (cherry picked from commit `badfcbaa1e`) Change-Id: Ia712cf736e490d482a52b63d8e2816d0b7035cd0	2015-10-19 15:41:22 -07:00
James Zern	5c3fe77dd8	iosbuild: fix linking with Xcode 7 / iOS SDK 9 -fembed-bitcode is the default, a framework built without this flag will fail to link against an application using it. BUG=267 (cherry picked from commit `db1321a6a2`) Change-Id: I83461cb058b1866ac99b3f0bdfa890933e88ed26	2015-10-19 15:41:22 -07:00
James Zern	f9f5498b6c	VP8LAllocateHistogramSet: align histogram[] entries fixes issue #262: a SIGBUS when accessing a misaligned double in VP8LHistogram (cherry picked from commit `cd82440ec7`) Change-Id: Ic78cc5366d7e43d892c375b6a69dce2379db931b	2015-10-19 15:41:21 -07:00
Pascal Massimino	3026db2ee5	Loosen the buffer size checks for Y/U/V/A too. (follow-up to `15ca5014`) (cherry picked from commit `017f8cccec`) Change-Id: Ia122e96f616bd6317c24b69c9534cb7919b8a4a4	2015-10-19 15:41:21 -07:00
Pascal Massimino	d089362d07	loosen the padding check on buffer size Strictly speaking, the last (or first) row doesn't require padding. cf https://code.google.com/p/webp/issues/detail?id=258 (cherry picked from commit `15ca5014f1`) Change-Id: Ie9ec8eb776fec1f5cea4cf9e21e81901fd79bf33	2015-10-19 15:41:21 -07:00
James Zern	53d22c5b3e	dec_neon: add whitespace around stringizing operator prevents unintentional side-effects (though unlikely in this case) with future compilers, cf: `eebaf97` dsp/mips: add whitespace around stringizing operator (cherry picked from commit `d623a8706f`) Change-Id: I0537091fcc97b4f54d0a156c3c83a28c51456b17	2015-10-19 15:41:21 -07:00
James Zern	8bcc4d4523	dsp/mips: add whitespace around stringizing operator fixes compile with gcc 5.1 BUG=259 (cherry picked from commit `eebaf97f5a`) Change-Id: Ideb39c6290ab8569b1b6cc835bea11c822d0286c	2015-10-19 15:41:20 -07:00
Urvang Joshi	d49c44f450	Container spec: clarify ordering of ALPH chunk. Reported by user: https://code.google.com/p/webp/issues/detail?id=255 (cherry picked from commit `585d93dbba`) Change-Id: I9c027ea828d5a367b317744fad7607a16ed52fa5	2015-10-19 15:41:20 -07:00
James Zern	382de22c84	msvc: fix pointer type warning in BitsLog2Floor _BitScanReverse() takes an unsigned long* http://msdn.microsoft.com/en-us/library/fbxyd7zd.aspx fixes: C4057: 'function': 'unsigned long ' differs in indirection to slightly different base types from 'uint32_t ' fixes issue #253 (cherry picked from commit `0250dfcc19`) Change-Id: I0101ef7be18c7ed188b35e9b17e7f71290953786	2015-10-19 15:41:20 -07:00
Urvang Joshi	84ecd9d85c	FlattenSimilarBlocks should only be tried when blending is possible. This is because, FlattenSimilarBlocks() replaces some opaque pixels by transparent ones. This results in an equivalent output only if blending is turned on for the current frame. (cherry picked from commit `5cccdadf2e`) Change-Id: I05612c952fdbd4b3a6e0ac9f3a7d49822f0cfb9b	2015-10-19 15:41:19 -07:00
James Zern	f55ebbba82	backport rescaler fix backported from: `7df9389`, `5ff0079` Change-Id: I11b4d97c3c483431528be9ccbd9895baac8c6a63	2015-10-19 15:41:13 -07:00
James Zern	2ff633c938	fix mips2 build target tested with mips1 and mips2; this should cover 3/4 as well. fixes an ftbfs reported on the debian issue tracker: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=785000 (cherry picked from commit `bf46d0acff`) Change-Id: I2458487c92bd638589fdfec5adb4f22102a5960c	2015-10-15 21:12:28 -07:00
James Zern	326b5fb863	update ChangeLog Change-Id: I794a5d58005bb0934ce0de06483567e8ed6bd8eb	2015-03-10 23:06:09 -07:00
Sam Clegg	a661e50bcb	Disable NEON code on Native Client The NEON assembler in libwebp has not yet been ported to Native Client. This change disables it. Related issue: https://code.google.com/p/nativeclient/issues/detail?id=3205 (cherry picked from commit `ac4f5784a0`) Change-Id: I200291db7aa79d40c1f10cff7622c9b8599e6886	2015-03-10 20:29:08 -07:00
James Zern	fcd94e925a	update ChangeLog Change-Id: I60c273c650a305fe36564ccc5fb1c8d7ea18118f	2015-03-04 11:30:23 -08:00
James Zern	569fe5789e	update NEWS Change-Id: Iade773fc3b9961fcd9b4112a3972cfc68e3670f2	2015-03-03 19:19:50 -08:00
James Zern	bd852f5d81	bump version to 0.4.3 libwebp{,decoder} - 0.4.3 libwebp libtool - 5.3.0 libwebpdecoder libtool - 1.3.0 mux/demux - 0.2.2 (unchanged) libtool - 1.2.0 (unchanged) Change-Id: Ie8c35ffc20c1bfd782bdafd99da6c6b1373022c1	2015-03-03 19:05:40 -08:00
James Zern	2d58b64f51	WebPPictureRescale: add a note about 0 width/height (cherry picked from commit `0f773693bf`) Change-Id: I3890bb3fd32a148d7dd24c714546160c6c59d4ea	2015-03-03 17:53:49 -08:00
James Zern	a0d8ca576f	examples/Android.mk: add webpmux_example target renamed from 'webpmux' to avoid name clash with the library module name (cherry picked from commit `6cef0e4fa4`) Change-Id: I33bbdbdcb25a6f35bd85c9a0dbbb93b2428b05f3	2015-03-03 17:53:49 -08:00
James Zern	34b1d29e3c	Android.mk: add webpmux target (cherry picked from commit `53c16ff047`) Change-Id: I60fc898fd804e23f08d760694192c5d04adcae91	2015-03-03 17:53:49 -08:00
James Zern	75619881e6	Android.mk: add webpdemux target (cherry picked from commit `21852a00a1`) Change-Id: I2fbbefbee59a96c52f5addcfc5bfe1216caad5cc	2015-03-03 17:53:48 -08:00
James Zern	a98757650a	Android.mk: add webpdecoder{,_static} targets webpdecoder_static is reused to create libwebpdecoder.so and libwebp.{a,so} (cherry picked from commit `8697a3bcc8`) Change-Id: I940293cb755040c0ea45dc13f22624de8f355867	2015-03-03 17:53:48 -08:00
James Zern	a6d4859725	Android.mk: split source lists per-directory will allow reuse in future targets (cherry picked from commit `4a67049113`) Conflicts: Android.mk Change-Id: Iededc19d954226e62f2d2383a2b80f268d613647	2015-03-03 17:53:48 -08:00
James Zern	77544d5f5b	fix iOS arm64 build with Xcode 6.3 the standard vtbl functions are available there [1][2]. based on a patch from: aaroncrespo fixes issue #243. [1] http://adcdownload.apple.com//Developer_Tools/Xcode_6.3_beta/Xcode_6.3_beta_Release_Notes.pdf [2] Apple LLVM Compiler Version 6.1 - Xcode 6.3 updates the Apple LLVM compiler to version 6.1.0. [...] Support for the arm64 architecture has been significantly revised to align with ARM's implementation, where the most visible impact is that a few of the vector intrinsics have changed to match ARM's specifications. (cherry picked from commit `602a00f93f`) Change-Id: I79a0016f44b9dbe36d0373f7f00a50ab3c2ca447	2015-03-03 17:53:47 -08:00
James Zern	6dea15784d	doc/webp-container-spec: note MSB order for chunk diagrams addresses question in issue #241 (cherry picked from commit `b510fbfe3b`) Change-Id: Iff6a172d5822f6ec8b9bc0951a1c9cd3f98c9251	2015-03-03 17:53:47 -08:00
James Zern	f7cd57b23d	doc/webp-container-spec: cosmetics partially normalize indent, vertical whitespace and capitalization with the copy used on developers.google.com/speed/webp (cherry picked from commit `e7d3df2314`) Change-Id: I8044418eeb9eaf5bd5c799675c74f6f845d503d6	2015-03-03 17:53:47 -08:00
James Zern	1d6b250b07	vwebp: clear canvas at the beginning of each loop this is in line with the recommendation in the spec, cf., `5603947` webp-container-spec: clarify background clear on loop (cherry picked from commit `1579de3cae`) Change-Id: Id3910395b05a1a1f2804be841b61f97bd4bac593	2015-03-03 17:53:46 -08:00
James Zern	f97b3f86bf	webp-container-spec: clarify background clear on loop at the beginning of the loop there's an implicit clear of the entire canvas to the background (or application defined) color. this avoids adding the final composited frame to the first. (cherry picked from commit `560394798f`) Change-Id: Ia3a52cf4482c6176334a5c9c99a0ddd07d1776e7	2015-03-03 17:53:46 -08:00
James Zern	4ba83c1759	vwebp: remove unnecessary static Help() prototype all uses occur after its declaration (cherry picked from commit `2c906c407c`) Change-Id: I775642ce6d1dec3bc6da2fa0d5d87490992c7e6c	2015-03-03 17:53:46 -08:00
James Zern	d34e8e3d18	vwebp/animation: display last frame on end-of-loop previously the first frame would be redisplayed, which might be unexpected if the final frame was meant to be a composite, for example. (cherry picked from commit `0f017b56f3`) Change-Id: I4da795623c71501e2fa426e8fba8fb2ffcbab58a	2015-03-03 17:53:45 -08:00
James Zern	bbbc524fb4	dec/vp8: clear 'dither_' on skipped blocks DitherRow() only checks this value, not 'skip_' so previously it was uninitialized for these blocks. (cherry picked from commit `66935fb9ee`) Change-Id: I0f698b81854ee9d91edacb51c1e3bdab9cba96f2	2015-03-03 17:53:45 -08:00
James Zern	0339fa26eb	lossless_neon: enable subtract green for aarch64 similar to: `1ba61b0` enable NEON intrinsics in aarch64 builds vtbl1_u8 is available everywhere but Xcode-based iOS arm64 builds, use vtbl1q_u8 there. performance varies based on the input, 1-3% on encode was observed (cherry picked from commit `416e1cea9b`) Change-Id: Ifec35b37eb856acfcf69ed7f16fa078cd40b7034	2015-03-03 17:53:45 -08:00
Urvang Joshi	5a0c2207f4	Regression fix for lossless decoding Reported here: https://code.google.com/p/webp/issues/detail?id=239 At the beginning of method 'DecodeImageData', pixels up to 'dec->last_pixel_' are assumed to be already cached. So, at the end of previous call to that method also, that assumption should hold true. Hence, we should cache all pixels up to 'src' regardless of 'src_last'. This affects lossless incremental decoding only, as that is when src_last and src_end differ. Note: alpha decoding is implicitly incremental, as alpha decoding of only the rows 'y_end - y_start' happens during FinishRow() call. So, this bug affects alpha decoding in non-incremental decoding flow as well. This bug was introduced in: https://gerrit.chromium.org/gerrit/#/c/59716. (cherry picked from commit `783a8cda24`) Change-Id: Ide6edfeb2609b02aff701e1bd9fd776da0a16be0	2015-03-03 17:53:44 -08:00
James Zern	6e3a31d595	wicdec: (msvs) quiet some /analyze warnings add additional return checks and asserts to avoid: C6102: Using 'XXX' from failed function call ... (cherry picked from commit `9b228b5416`) Change-Id: I51f5fa630324e0cd7b2d9fceefecb4f4021474b1	2015-03-03 17:53:44 -08:00
James Zern	b49a578135	dwebp/WritePNG: mark png variables volatile these are used on both sides of the setjmp(). (cherry picked from commit `7a191398ca`) Change-Id: I4a789bfb3a5d56946a22286c5a140008d90e1ba2	2015-03-03 17:53:43 -08:00
James Zern	0a4391a196	dwebp: include setjmp.h w/WEBP_HAVE_PNG setjmp() is used in WritePNG(). (cherry picked from commit `775dfad297`) Change-Id: Iadd836272fc7d368d635c891507ce2a08c4d3dec	2015-03-03 17:53:43 -08:00
James Zern	90f1ec58a9	dwebp: correct sign in format strings width / height are unsigned; fixes a warning with msvs /analyze: C6340: Mismatch on sign: 'const unsigned int' passed as _Param_(4) when some signed type is required in call to 'fprintf'. (cherry picked from commit `47d26be760`) Change-Id: I5f1fad4c93745baf17d70178a5e66579ccd2b155	2015-03-03 17:53:43 -08:00
James Zern	b61ce861f3	VP8LEncodeStream: add an assert check enc->argb_ to quiet an msvs /analyze warning: C6387: 'enc->argb_+y*width' could be '0': this does not adhere to the specification for the function 'memcpy'. (cherry picked from commit `f0e0677b87`) Change-Id: I87544e92ee0d3ea38942a475c30c6d552f9877b7	2015-03-03 17:53:42 -08:00
James Zern	df1081bb82	dsp/cpu: (msvs) add include for __cpuidex and only use it on x86 / x64 where it's available. has the side-effect of quieting a msvs /analyze warning: C6001: Using uninitialized memory 'cpu_info'. (cherry picked from commit `0de5f33e31`) Change-Id: Iae51be3b22b2ee949cfc473eeea9fd9fb6b3c2cb	2015-03-03 17:53:42 -08:00
James Zern	39aa055529	dsp/cpu: (msvs) avoid immintrin.h on _M_ARM _xgetbv() isn't relevant there anyway broken since: `279e661` Merge "dsp/cpu: add include for _xgetbv() w/MSVS" (cherry picked from commit `4fbe9cf202`) Change-Id: Iaa7bc0c5be9c06bfffab39e194c64c09bf5b5a27	2015-03-03 17:53:42 -08:00
James Zern	f814f429ca	dsp/cpu: add include for _xgetbv() w/MSVS explicitly add immintrin.h instead of transitively picking it up via windows.h presumably. makes the code easier to move around. (cherry picked from commit `b6c0428e8c`) Change-Id: If70d5143ac94fc331da763ce034358858e460e06	2015-03-03 17:53:41 -08:00
James Zern	8508ab99a7	cpu: fix AVX2 detection for gcc/clang targets ecx needs to be set to 0; the visual studio builds were already doing this. https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family (cherry picked from commit `d7c4b02a57`) Change-Id: I95efb115b4d50bbdb6b14fca2aa63d0a24974e55	2015-03-03 17:53:41 -08:00
Pascal Massimino	5769623b6f	fix handling of zero-sized partition #0 corner case reported in https://code.google.com/p/webp/issues/detail?id=237 An empty partition #0 should be indicative of a bitstream error. The previous code was correct, only an assert was triggered in debug mode. But we might as well handle the case properly right away... (cherry picked from commit `205c7f26af`) Change-Id: I4dc31a46191fa9e65659c9a5bf5de9605e93f2f5	2015-03-03 17:53:40 -08:00
James Zern	b2e71a9080	make the 'last_cpuinfo_used' variable names unique allows the sources to be #include'd in some hackish builds (don't do that!) (cherry picked from commit `67f601cd46`) Conflicts: src/dsp/alpha_processing.c src/dsp/argb.c src/dsp/dec.c src/dsp/enc.c src/dsp/lossless.c src/dsp/upsampling.c src/dsp/yuv.c Change-Id: I0c7a43acbebd0e2d5068845e6daa8ce47361cd91	2015-03-02 18:43:41 -08:00
Pascal Massimino	1273e84517	add -Wformat-nonliteral and -Wformat-security can be useful, not sure they are a subset of the flags we use already... (cherry picked from commit `80d950d94e`) Change-Id: Iec742a99427a791d9527368302a1136df2ff96cd	2015-03-02 18:43:35 -08:00
Pascal Massimino	3ae78eb757	multi-thread fix: lock each entry points with a static var we compare the current VP8GetCPUInfo pointer to the last used. This is less code overall and each implementation is still testable separately (by just changing VP8GetCPUInfo, but not a separate threads!) (cherry picked from commit `a437694a17`) Conflicts: src/dsp/alpha_processing.c src/dsp/argb.c src/dsp/dec.c src/dsp/enc.c src/dsp/lossless.c src/dsp/upsampling.c src/dsp/yuv.c Change-Id: Ia13fa8ffc4561a884508f6ab71ed0d1b9f1ce59b	2015-03-02 18:43:31 -08:00
James Zern	5c1eeda922	webp-container-spec: remove references to fragments this portion of the format was never finalized (cherry picked from commit `a66e66c79d`) Change-Id: I80aa1b27457a0e52b047c7284df2f58b181ca5d8	2015-03-02 18:43:26 -08:00
James Zern	c5ceea4899	enc_neon: fix building with non-Xcode clang (iOS) check for __apple_build_version__ to distinguish the two; a version check could work as Apple bumped Xcode's to 5.x/6.x, but it's unclear how upstream will deal with their versioning as they go 3.6+, so avoid it for now. (cherry picked from commit `a3946b8956`) Change-Id: I67cda67c4f68e262a92d805a63cc1496374be063	2015-03-02 18:43:20 -08:00
James Zern	d0859d69de	iosbuild: add x86_64 simulator support based on the patch here: https://github.com/pixelkind/webp-ios-build (cherry picked from commit `a96ccf8fde`) Change-Id: Iaa346b751e5f18e8cf13a8e5c4064b0c2a3f5f6c	2015-03-02 18:43:14 -08:00
Urvang Joshi	046732ca65	WebPEncode: Support encoding same pic twice (even if modified) This wasn't working for this specific scenario: - Encode an RGBA 'pic' (with trivial alpha) using lossy encoding. (so that pic->a == NULL after import happens). - Modify the 'pic->argb' so that it has non-trivial alpha. - Encode the same 'pic' again. This used to fail to encode alpha data as pic->a == NULL. (cherry picked from commit `e4f4dddba3`) Change-Id: Ieaaa7bd09825c42f54fbd99e6781d98f0b19cc0c	2015-03-02 18:43:08 -08:00
James Zern	4426f50179	webp/types.h: use inline for clang++/-std=c++11 at least clang 3.[45] in c++ mode with -std=c++11 define __STRICT_ANSI__ this change set WEBP_INLINE to inline for c++/non-strict-ansi/> c99 fixes crbug.com/428383 (cherry picked from commit `6638710b9e`) Change-Id: Ief2b934353c336a75865c73c90cc3dc5e4f83913	2015-03-02 18:43:02 -08:00
Urvang Joshi	e297fc7171	gif2webp: Use the default hint instead of WEBP_HINT_GRAPH. This is much faster and the compression is slightly better too. (cherry picked from commit `c94ed49efd`) Change-Id: Ibf0d10eea83bfabfcc44ee497074767462ff41b1	2015-03-02 18:42:54 -08:00
James Zern	855fe4354b	Makefile.vc: add a 'legacy' RTLIBCFG option disables buffer security checks (/GS-) and any machine optimizations (e.g., sse2) fixes issue #228 (cherry picked from commit `34c20c06c8`) Change-Id: I81fa483dc1654199b2017626320383d2d63317dc	2015-03-02 18:42:48 -08:00
Urvang Joshi	b7eb6d55c7	gif2webp: Support GIF_DISPOSE_RESTORE_PREVIOUS Tweaked the gif2webp_util API to support this. Requested in: https://code.google.com/p/webp/issues/detail?id=144 (cherry picked from commit `65e5eb8a62`) Change-Id: I0e8c4edc39227355cd8d3acc55795186e25d0c3a	2015-03-02 18:42:40 -08:00
Urvang Joshi	5691bdd9da	gif2webp: Handle frames with odd offsets + disposal to background. Snapping odd offsets in GIF to even offsets in WebP was causing extra row/column being disposed in such cases. Code is rewritten to maintain previous and current canvas (it used to maintain previous canvas and current frame earlier). And we recompute change rectangles as those from GIF may no longer apply. Also, this renders methods like ReduceTransparency() and ConvertToKeyFrame() redundant, as internally maintained current canvas is always independent of previous canvases. Disposal method choice: we pick the disposal method that results in the smallest change rectangle. (cherry picked from commit `e4c829efe9`) Conflicts: examples/gif2webp_util.c Change-Id: Ic31186d98fe1a2a790a89d1571b17e3abd127e79	2015-03-02 18:42:36 -08:00
James Zern	8301da1380	stopwatch.h: fix includes WEBP_INLINE -> webp/types.h memcpy -> string.h (cherry picked from commit `54edbf65ff`) Change-Id: Iab2ea8b553dc98be75eede751de62ab0292d1f97	2015-03-02 18:42:28 -08:00