update ChangeLog

Change-Id: I05b9ac2cf8fdd26e3f2e68780dca3e613bc2f732
Clarify the expected 'config' lifespan in WebPIDecode()
2025-07-15 13:29:54 +02:00 · 2016-07-06 18:00:16 -07:00 · 2016-07-06 17:17:15 -07:00 · 2016-06-27 16:49:43 -07:00 · 2016-06-27 20:01:44 +02:00 · 2016-06-24 19:29:43 -07:00
203 changed files with 29498 additions and 10532 deletions
--- a/.gitignore
+++ b/.gitignore
@ -19,6 +19,7 @@
 /stamp-h1
 Makefile
 Makefile.in
+examples/anim_diff
 examples/[cdv]webp
 examples/gif2webp
 examples/webpmux
@ -30,3 +31,6 @@ src/webp/stamp-h1
 *.pdb
 /iosbuild
 /WebP.framework
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
--- a/.mailmap
+++ b/.mailmap
@ -6,3 +6,6 @@ Vikas Arora <vikasa@google.com>
 <vikasa@google.com> <vikasa@gmail.com>
 <vikasa@google.com> <vikaas.arora@gmail.com>
 <slobodan.prijic@imgtec.com> <Slobodan.Prijic@imgtec.com>
+<vrabaud@google.com> <vincent.rabaud@gmail.com>
+Tamar Levy <tamar.levy@intel.com>
+<qrczak@google.com> <qrczak>
--- a/12
+++ b/12
@ -7,19 +7,29 @@ Contributors:
 - Johann (johann dot koenig at duck dot com)
 - Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
 - Jyrki Alakuijala (jyrki at google dot com)
- levytamar82 (tamar dot levy at intel dot com)
+- Lode Vandevenne (lode at google dot com)
 - Lou Quillio (louquillio at google dot com)
 - Mans Rullgard (mans at mansr dot com)
+- Marcin Kowalczyk (qrczak at google dot com)
 - Martin Olsson (mnemo at minimum dot se)
 - Mikołaj Zalewski (mikolajz at google dot com)
+- Mislav Bradac (mislavm at google dot com)
+- Nico Weber (thakis at chromium dot org)
 - Noel Chromium (noel at chromium dot org)
+- Parag Salasakar (img dot mips1 at gmail dot com)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
 - Pierre Joye (pierre dot php at gmail dot com)
+- Sam Clegg (sbc at chromium dot org)
+- Scott Hancher (seh at google dot com)
 - Scott LaVarnway (slavarnway at google dot com)
 - Scott Talbot (s at chikachow dot org)
 - Slobodan Prijic (slobodan dot prijic at imgtec dot com)
 - Somnath Banerjee (somnath dot banerjee at gmail dot com)
+- Sriraman Tallam (tmsriram at google dot com)
+- Tamar Levy (tamar dot levy at intel dot com)
 - Timothy Gu (timothygu99 at gmail dot com)
 - Urvang Joshi (urvang at google dot com)
 - Vikas Arora (vikasa at google dot com)
+- Vincent Rabaud (vrabaud at google dot com)
+- Yang Zhang (yang dot zhang at arm dot com)
--- a/Android.mk
+++ b/Android.mk
@ -15,6 +15,7 @@ ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
  # instructions to be generated for armv7a code. Instead target the neon code
  # specifically.
  NEON := c.neon
+  USE_CPUFEATURES := yes
 else
  NEON := c
 endif
@ -32,34 +33,65 @@ dec_srcs := \
    src/dec/webp.c \

 demux_srcs := \
+    src/demux/anim_decode.c \
    src/demux/demux.c \

 dsp_dec_srcs := \
    src/dsp/alpha_processing.c \
+    src/dsp/alpha_processing_mips_dsp_r2.c \
    src/dsp/alpha_processing_sse2.c \
+    src/dsp/alpha_processing_sse41.c \
+    src/dsp/argb.c \
+    src/dsp/argb_mips_dsp_r2.c \
+    src/dsp/argb_sse2.c \
    src/dsp/cpu.c \
    src/dsp/dec.c \
    src/dsp/dec_clip_tables.c \
    src/dsp/dec_mips32.c \
+    src/dsp/dec_mips_dsp_r2.c \
+    src/dsp/dec_msa.c \
    src/dsp/dec_neon.$(NEON) \
    src/dsp/dec_sse2.c \
+    src/dsp/dec_sse41.c \
+    src/dsp/filters.c \
+    src/dsp/filters_mips_dsp_r2.c \
+    src/dsp/filters_sse2.c \
    src/dsp/lossless.c \
-    src/dsp/lossless_mips32.c \
+    src/dsp/lossless_mips_dsp_r2.c \
    src/dsp/lossless_neon.$(NEON) \
    src/dsp/lossless_sse2.c \
+    src/dsp/rescaler.c \
+    src/dsp/rescaler_mips32.c \
+    src/dsp/rescaler_mips_dsp_r2.c \
+    src/dsp/rescaler_neon.$(NEON) \
+    src/dsp/rescaler_sse2.c \
    src/dsp/upsampling.c \
+    src/dsp/upsampling_mips_dsp_r2.c \
    src/dsp/upsampling_neon.$(NEON) \
    src/dsp/upsampling_sse2.c \
    src/dsp/yuv.c \
    src/dsp/yuv_mips32.c \
+    src/dsp/yuv_mips_dsp_r2.c \
    src/dsp/yuv_sse2.c \

 dsp_enc_srcs := \
+    src/dsp/cost.c \
+    src/dsp/cost_mips32.c \
+    src/dsp/cost_mips_dsp_r2.c \
+    src/dsp/cost_sse2.c \
    src/dsp/enc.c \
    src/dsp/enc_avx2.c \
    src/dsp/enc_mips32.c \
+    src/dsp/enc_mips_dsp_r2.c \
    src/dsp/enc_neon.$(NEON) \
    src/dsp/enc_sse2.c \
+    src/dsp/enc_sse41.c \
+    src/dsp/lossless_enc.c \
+    src/dsp/lossless_enc_mips32.c \
+    src/dsp/lossless_enc_mips_dsp_r2.c \
+    src/dsp/lossless_enc_neon.$(NEON) \
+    src/dsp/lossless_enc_sse2.c \
+    src/dsp/lossless_enc_sse41.c \

 enc_srcs := \
    src/enc/alpha.c \
@ -67,10 +99,12 @@ enc_srcs := \
    src/enc/backward_references.c \
    src/enc/config.c \
    src/enc/cost.c \
+    src/enc/delta_palettization.c \
    src/enc/filter.c \
    src/enc/frame.c \
    src/enc/histogram.c \
    src/enc/iterator.c \
+    src/enc/near_lossless.c \
    src/enc/picture.c \
    src/enc/picture_csp.c \
    src/enc/picture_psnr.c \
@ -84,6 +118,7 @@ enc_srcs := \
    src/enc/webpenc.c \

 mux_srcs := \
+    src/mux/anim_encode.c \
    src/mux/muxedit.c \
    src/mux/muxinternal.c \
    src/mux/muxread.c \
@ -120,7 +155,9 @@ LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
 # prefer arm over thumb mode for performance gains
 LOCAL_ARM_MODE := arm

-LOCAL_STATIC_LIBRARIES := cpufeatures
+ifeq ($(USE_CPUFEATURES),yes)
+  LOCAL_STATIC_LIBRARIES := cpufeatures
+endif

 LOCAL_MODULE := webpdecoder_static

@ -212,4 +249,6 @@ endif

 include $(LOCAL_PATH)/examples/Android.mk

-$(call import-module,android/cpufeatures)
+ifeq ($(USE_CPUFEATURES),yes)
+  $(call import-module,android/cpufeatures)
+endif
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -0,0 +1,333 @@
+cmake_minimum_required(VERSION 2.8.7)
+
+project(libwebp C)
+
+# Options for coder / decoder executables.
+option(WEBP_BUILD_CWEBP "Build the cwebp command line tool." OFF)
+option(WEBP_BUILD_DWEBP "Build the dwebp command line tool." OFF)
+option(WEBP_EXPERIMENTAL_FEATURES "Build with experimental features." OFF)
+option(WEBP_FORCE_ALIGNED "Force aligned memory operations." OFF)
+
+set(WEBP_DEP_LIBRARIES)
+set(WEBP_DEP_INCLUDE_DIRS)
+
+# Default to a Release build when no build type was requested. The cache
+# signature is set(<var> <value> CACHE <type> <docstring> [FORCE]).
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING
+    "Build type: Release, Debug or RelWithDebInfo" FORCE
+  )
+endif()
+
+################################################################################
+# Generate the config.h to compile with specific intrinsics / libs.
+
+## Check for compiler options.
+include(CheckCSourceCompiles)
+check_c_source_compiles("
+    int main(void) {
+      (void)__builtin_bswap16(0);
+      return 0;
+    }
+  "
+  HAVE_BUILTIN_BSWAP16
+)
+check_c_source_compiles("
+    int main(void) {
+      (void)__builtin_bswap32(0);
+      return 0;
+    }
+  "
+  HAVE_BUILTIN_BSWAP32
+)
+check_c_source_compiles("
+    int main(void) {
+      (void)__builtin_bswap64(0);
+      return 0;
+    }
+  "
+  HAVE_BUILTIN_BSWAP64
+)
+
+## Check for libraries.
+find_package(Threads)
+if(Threads_FOUND)
+  if(CMAKE_USE_PTHREADS_INIT)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
+  endif()
+  foreach(PTHREAD_TEST HAVE_PTHREAD_PRIO_INHERIT PTHREAD_CREATE_UNDETACHED)
+    check_c_source_compiles("
+        #include <pthread.h>
+        int main (void) {
+          int attr = ${PTHREAD_TEST};
+          return attr;
+        }
+      " ${PTHREAD_TEST}
+    )
+  endforeach()
+  list(APPEND WEBP_DEP_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+endif()
+set(WEBP_USE_THREAD ${Threads_FOUND})
+
+# TODO: this seems unused, check with autotools.
+set(LT_OBJDIR ".libs/")
+
+# Only useful for vwebp, so useless for now.
+# find_package(OpenGL)
+# set(WEBP_HAVE_GL ${OPENGL_FOUND})
+# set(WEBP_DEP_INCLUDE_DIRS ${WEBP_DEP_INCLUDE_DIRS} ${OPENGL_INCLUDE_DIRS})
+# set(WEBP_DEP_LIBRARIES ${WEBP_DEP_LIBRARIES} ${OPENGL_LIBRARIES})
+
+# Find the standard C math library.
+find_library(MATH_LIBRARY NAMES m)
+if(MATH_LIBRARY)
+  list(APPEND WEBP_DEP_LIBRARIES ${MATH_LIBRARY})
+endif()
+
+# Find the standard image libraries.
+set(WEBP_DEP_IMG_LIBRARIES)
+set(WEBP_DEP_IMG_INCLUDE_DIRS)
+foreach(I_LIB PNG JPEG TIFF GIF)
+  find_package(${I_LIB})
+  set(WEBP_HAVE_${I_LIB} ${${I_LIB}_FOUND})
+  if(${I_LIB}_FOUND)
+    list(APPEND WEBP_DEP_IMG_LIBRARIES ${${I_LIB}_LIBRARIES})
+    list(APPEND WEBP_DEP_IMG_INCLUDE_DIRS ${${I_LIB}_INCLUDE_DIRS})
+  endif()
+endforeach()
+
+## Check for specific headers.
+include(CheckIncludeFiles)
+check_include_files("stdlib.h;stdarg.h;string.h;float.h" STDC_HEADERS)
+check_include_files(dlfcn.h HAVE_DLFCN_H)
+check_include_files(GLUT/glut.h HAVE_GLUT_GLUT_H)
+check_include_files(GL/glut.h HAVE_GL_GLUT_H)
+check_include_files(inttypes.h HAVE_INTTYPES_H)
+check_include_files(memory.h HAVE_MEMORY_H)
+check_include_files(OpenGL/glut.h HAVE_OPENGL_GLUT_H)
+check_include_files(shlwapi.h HAVE_SHLWAPI_H)
+check_include_files(stdint.h HAVE_STDINT_H)
+check_include_files(stdlib.h HAVE_STDLIB_H)
+check_include_files(strings.h HAVE_STRINGS_H)
+check_include_files(string.h HAVE_STRING_H)
+check_include_files(sys/stat.h HAVE_SYS_STAT_H)
+check_include_files(sys/types.h HAVE_SYS_TYPES_H)
+check_include_files(unistd.h HAVE_UNISTD_H)
+check_include_files(wincodec.h HAVE_WINCODEC_H)
+check_include_files(windows.h HAVE_WINDOWS_H)
+
+# Windows specifics
+if(HAVE_WINCODEC_H)
+  list(APPEND WEBP_DEP_LIBRARIES shlwapi ole32 windowscodecs)
+endif()
+
+## Check for SIMD extensions.
+set(WEBP_SIMD_FLAGS "SSE2;SSE41;AVX2")
+set(WEBP_SIMD_FILE_EXTENSIONS "_sse2.c;_sse41.c;_avx2.c")
+if(MSVC)
+  # MSVC does not have a SSE4 flag but AVX2 support implies
+  # SSE4 support.
+  set(SIMD_ENABLE_FLAGS "/arch:SSE2;/arch:AVX2;/arch:AVX2")
+  set(SIMD_DISABLE_FLAGS)
+else()
+  set(SIMD_ENABLE_FLAGS "-msse2;-msse4.1;-mavx2")
+  set(SIMD_DISABLE_FLAGS "-mno-sse2;-mno-sse4.1;-mno-avx2")
+endif()
+
+# Unsupported SIMD sources (to drop from the build) and supported SIMD sources
+# together with, index for index, the compile flag each of them needs.
+set(WEBP_SIMD_FILES_NOT_TO_INCLUDE)
+set(WEBP_SIMD_FILES_TO_INCLUDE)
+set(WEBP_SIMD_FLAGS_TO_INCLUDE)
+
+include(CheckCCompilerFlag)
+list(LENGTH WEBP_SIMD_FLAGS WEBP_SIMD_FLAGS_LENGTH)
+math(EXPR WEBP_SIMD_FLAGS_RANGE "${WEBP_SIMD_FLAGS_LENGTH} - 1")
+
+foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
+  list(GET WEBP_SIMD_FLAGS ${I_SIMD} WEBP_SIMD_FLAG)
+  list(GET SIMD_ENABLE_FLAGS ${I_SIMD} SIMD_COMPILE_FLAG)
+  # The probe only compiles if WEBP_USE_<flag> is defined once dsp.h has been
+  # included with the enabling flag on the command line.
+  set(CMAKE_REQUIRED_FLAGS ${SIMD_COMPILE_FLAG})
+  check_c_source_compiles("
+      #include \"${CMAKE_CURRENT_LIST_DIR}/src/dsp/dsp.h\"
+      int main(void) {
+        #if !defined(WEBP_USE_${WEBP_SIMD_FLAG})
+        this is not valid code
+        #endif
+        return 0;
+      }
+    "
+    WEBP_HAVE_${WEBP_SIMD_FLAG}
+  )
+  # Do not leak the enabling flag into the compile checks that follow.
+  set(CMAKE_REQUIRED_FLAGS)
+
+  # Check which files we should include or not. The paths are kept absolute so
+  # that they compare equal to the ${CMAKE_CURRENT_SOURCE_DIR}-prefixed entries
+  # of WEBP_SRCS when unsupported files are removed from that list.
+  list(GET WEBP_SIMD_FILE_EXTENSIONS ${I_SIMD} WEBP_SIMD_FILE_EXTENSION)
+  file(GLOB SIMD_FILES
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/dsp/*${WEBP_SIMD_FILE_EXTENSION}"
+  )
+  if(WEBP_HAVE_${WEBP_SIMD_FLAG})
+    # Memorize the file and flags.
+    foreach(FILE ${SIMD_FILES})
+      list(APPEND WEBP_SIMD_FILES_TO_INCLUDE ${FILE})
+      list(APPEND WEBP_SIMD_FLAGS_TO_INCLUDE ${SIMD_COMPILE_FLAG})
+    endforeach()
+  else()
+    # Remove the file from the list.
+    foreach(FILE ${SIMD_FILES})
+      list(APPEND WEBP_SIMD_FILES_NOT_TO_INCLUDE ${FILE})
+    endforeach()
+    # Explicitly disable SIMD.
+    if(SIMD_DISABLE_FLAGS)
+      list(GET SIMD_DISABLE_FLAGS ${I_SIMD} SIMD_COMPILE_FLAG)
+      # The result variable is cached: clear it so that every disabling flag
+      # is really tested instead of reusing the first flag's answer.
+      unset(HAS_COMPILE_FLAG CACHE)
+      check_c_compiler_flag(${SIMD_COMPILE_FLAG} HAS_COMPILE_FLAG)
+      if(HAS_COMPILE_FLAG)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_COMPILE_FLAG}")
+      endif()
+    endif()
+  endif()
+endforeach()
+
+## Define extra info.
+set(PACKAGE ${PROJECT_NAME})
+set(PACKAGE_NAME ${PROJECT_NAME})
+
+# Read from configure.ac.
+file(READ ${CMAKE_CURRENT_SOURCE_DIR}/configure.ac CONFIGURE_AC)
+string(REGEX MATCHALL "\\[([0-9a-z\\.:/]*)\\]"
+  CONFIGURE_AC_PACKAGE_INFO ${CONFIGURE_AC}
+)
+# Removes the first and last character (the surrounding brackets) of the value
+# stored in the variable named by VAR and writes the result back to that
+# variable in the caller's scope.
+function(strip_bracket VAR)
+  string(LENGTH ${${VAR}} FULL_LENGTH)
+  # Two characters are dropped: one at each end.
+  math(EXPR INNER_LENGTH ${FULL_LENGTH}-2)
+  string(SUBSTRING ${${VAR}} 1 ${INNER_LENGTH} INNER_TEXT)
+  set(${VAR} ${INNER_TEXT} PARENT_SCOPE)
+endfunction()
+
+list(GET CONFIGURE_AC_PACKAGE_INFO 1 PACKAGE_VERSION)
+strip_bracket(PACKAGE_VERSION)
+list(GET CONFIGURE_AC_PACKAGE_INFO 2 PACKAGE_BUGREPORT)
+strip_bracket(PACKAGE_BUGREPORT)
+list(GET CONFIGURE_AC_PACKAGE_INFO 3 PACKAGE_URL)
+strip_bracket(PACKAGE_URL)
+
+# Build more info.
+set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
+set(PACKAGE_TARNAME ${PACKAGE_NAME})
+set(VERSION ${PACKAGE_VERSION})
+
+## Generate the config.h header.
+configure_file(${CMAKE_CURRENT_LIST_DIR}/cmake/config.h.in
+  ${CMAKE_CURRENT_BINARY_DIR}/include/webp/config.h)
+add_definitions(-DHAVE_CONFIG_H)
+# The webp folder is included as we reference config.h as
+# ../webp/config.h or webp/config.h
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/include
+  ${CMAKE_CURRENT_BINARY_DIR}/include/webp
+)
+
+
+################################################################################
+# WebP source files.
+# Read the Makefile.am to get the source files.
+set(WEBP_SRCS)
+
+# Collects the file names found on every "_SOURCES += ..." line of
+# ${FOLDER}/Makefile.am, prefixes each of them with ${FOLDER}, appends them to
+# the list given as second argument and stores the result in the caller's
+# WEBP_SRCS variable.
+function(parse_Makefile_am FOLDER WEBP_SRCS)
+  file(READ ${FOLDER}/Makefile.am MAKEFILE_CONTENT)
+  string(REGEX MATCHALL "_SOURCES \\+= [^\n]*"
+    SOURCES_LINES ${MAKEFILE_CONTENT}
+  )
+  set(ALL_SRCS ${WEBP_SRCS})
+  foreach(SOURCES_LINE ${SOURCES_LINES})
+    # Skip the 12 leading characters, i.e. the "_SOURCES += " prefix.
+    string(SUBSTRING ${SOURCES_LINE} 12 -1 FILE_NAMES)
+    string(REGEX MATCHALL "[0-9a-z\\._]+"
+      FILE_NAMES ${FILE_NAMES}
+    )
+    foreach(FILE_NAME ${FILE_NAMES})
+      list(APPEND ALL_SRCS ${FOLDER}/${FILE_NAME})
+    endforeach()
+  endforeach()
+  set(WEBP_SRCS ${ALL_SRCS} PARENT_SCOPE)
+endfunction()
+
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/dec "${WEBP_SRCS}")
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/demux "${WEBP_SRCS}")
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/dsp "${WEBP_SRCS}")
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/enc "${WEBP_SRCS}")
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/utils "${WEBP_SRCS}")
+
+# Remove the files specific to SIMD we don't use.
+foreach(FILE ${WEBP_SIMD_FILES_NOT_TO_INCLUDE})
+  list(REMOVE_ITEM WEBP_SRCS ${FILE})
+endforeach()
+
+# Build the library.
+add_definitions(-Wall)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/ ${WEBP_DEP_INCLUDE_DIRS})
+add_library(webp ${WEBP_SRCS})
+target_link_libraries(webp ${WEBP_DEP_LIBRARIES})
+
+# Change the compile flags for SIMD files we use. WEBP_SIMD_FILES_TO_INCLUDE
+# and WEBP_SIMD_FLAGS_TO_INCLUDE are read index for index. Nothing is done when
+# no SIMD file was selected, as the range would otherwise end at -1.
+list(LENGTH WEBP_SIMD_FILES_TO_INCLUDE WEBP_SIMD_FILES_TO_INCLUDE_LENGTH)
+if(WEBP_SIMD_FILES_TO_INCLUDE_LENGTH GREATER 0)
+  math(EXPR WEBP_SIMD_FILES_TO_INCLUDE_RANGE
+    "${WEBP_SIMD_FILES_TO_INCLUDE_LENGTH}-1"
+  )
+
+  foreach(I_FILE RANGE ${WEBP_SIMD_FILES_TO_INCLUDE_RANGE})
+    list(GET WEBP_SIMD_FILES_TO_INCLUDE ${I_FILE} FILE)
+    list(GET WEBP_SIMD_FLAGS_TO_INCLUDE ${I_FILE} SIMD_COMPILE_FLAG)
+    set_source_files_properties(${FILE} PROPERTIES
+      COMPILE_FLAGS ${SIMD_COMPILE_FLAG}
+    )
+  endforeach()
+endif()
+
+# Build the executables if asked for.
+if(WEBP_BUILD_CWEBP OR WEBP_BUILD_DWEBP)
+  # Example utility library.
+  set(exampleutil_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/example_util.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/example_util.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
+  add_library(exampleutil ${exampleutil_SRCS})
+  target_link_libraries(exampleutil webp ${WEBP_DEP_LIBRARIES})
+endif()
+
+if(WEBP_BUILD_CWEBP)
+  # Image-decoding utility library.
+  set(exampledec_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/image_dec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/image_dec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/jpegdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/jpegdec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/metadata.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/metadata.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/pngdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/pngdec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/tiffdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/tiffdec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/webpdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/webpdec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/wicdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/wicdec.h)
+  add_library(exampledec ${exampledec_SRCS})
+  target_link_libraries(exampledec webp ${WEBP_DEP_LIBRARIES}
+    ${WEBP_DEP_IMG_LIBRARIES})
+endif()
+
+if(WEBP_BUILD_DWEBP)
+  # dwebp
+  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
+  add_executable(dwebp
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/dwebp.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h
+  )
+  target_link_libraries(dwebp webp exampleutil ${WEBP_DEP_LIBRARIES}
+    ${WEBP_DEP_IMG_LIBRARIES}
+  )
+endif()
+
+if(WEBP_BUILD_CWEBP)
+  # cwebp
+  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
+  add_executable(cwebp
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/cwebp.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
+  target_link_libraries(cwebp exampledec webp exampleutil
+    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES}
+  )
+endif()
--- a/1060
+++ b/1060
--- a/Makefile.vc
+++ b/Makefile.vc
@ -11,6 +11,8 @@ LIBWEBPDEMUX_BASENAME = libwebpdemux
 ARCH = x86
 !ELSE IF ! [ cl 2>&1 | find "x64" > NUL ]
 ARCH = x64
+!ELSE IF ! [ cl 2>&1 | find "ARM" > NUL ]
+ARCH = ARM
 !ELSE
 !ERROR Unable to auto-detect toolchain architecture! \
 If cl.exe is in your PATH rerun nmake with ARCH=<arch>.
@ -29,7 +31,6 @@ CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
 CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
 CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
-CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
 LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
 LNKDLL     = link.exe /DLL $(NOLOGO)
@ -37,6 +38,12 @@ LNKEXE     = link.exe $(NOLOGO)
 LNKLIB     = lib.exe $(NOLOGO)
 MT         = mt.exe $(NOLOGO)

+!IF "$(ARCH)" == "ARM"
+CFLAGS = $(CFLAGS) /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP /DWEBP_USE_THREAD
+!ELSE
+CFLAGS = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
+!ENDIF
+
 CFGSET     = FALSE
 !IF "$(OBJDIR)" == ""
 OUTDIR = ..\obj\
@ -111,9 +118,7 @@ LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME).lib
 LIBWEBPMUX = $(DIRLIB)\$(LIBWEBPMUX_BASENAME).lib
 LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPDEMUX_BASENAME).lib
 !ELSE IF "$(DLLBUILD)" == "TRUE"
-DLLC   = webp_dll.c
 DLLINC = webp_dll.h
-DLL_OBJS = $(DIROBJ)\$(DLLC:.c=.obj)
 CC     = $(CC) /I$(DIROBJ) /FI$(DLLINC) $(RTLIB) /DWEBP_DLL
 LIBWEBPDECODER = $(DIRLIB)\$(LIBWEBPDECODER_BASENAME)_dll.lib
 LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME)_dll.lib
@ -142,6 +147,8 @@ CFGSET = TRUE
 !MESSAGE .                                  features enabled.
 !MESSAGE - (empty)                        - build libwebp-based targets for CFG
 !MESSAGE - all                            - build (de)mux-based targets for CFG
+!MESSAGE - gif2webp                       - requires libgif & >= VS2013
+!MESSAGE - anim_diff                      - requires libgif & >= VS2013
 !MESSAGE
 !MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
 !MESSAGE   'legacy' will produce a Windows 2000 compatible library.
@ -174,36 +181,71 @@ DEC_OBJS = \
    $(DIROBJ)\dec\webp.obj \

 DEMUX_OBJS = \
+    $(DIROBJ)\demux\anim_decode.obj \
    $(DIROBJ)\demux\demux.obj \

 DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\alpha_processing.obj \
+    $(DIROBJ)\dsp\alpha_processing_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\alpha_processing_sse2.obj \
+    $(DIROBJ)\dsp\alpha_processing_sse41.obj \
    $(DIROBJ)\dsp\cpu.obj \
    $(DIROBJ)\dsp\dec.obj \
    $(DIROBJ)\dsp\dec_clip_tables.obj \
    $(DIROBJ)\dsp\dec_mips32.obj \
+    $(DIROBJ)\dsp\dec_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\dec_msa.obj \
    $(DIROBJ)\dsp\dec_neon.obj \
    $(DIROBJ)\dsp\dec_sse2.obj \
+    $(DIROBJ)\dsp\dec_sse41.obj \
+    $(DIROBJ)\dsp\filters.obj \
+    $(DIROBJ)\dsp\filters_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\filters_sse2.obj \
    $(DIROBJ)\dsp\lossless.obj \
-    $(DIROBJ)\dsp\lossless_mips32.obj \
+    $(DIROBJ)\dsp\lossless_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\lossless_neon.obj \
    $(DIROBJ)\dsp\lossless_sse2.obj \
+    $(DIROBJ)\dsp\rescaler.obj \
+    $(DIROBJ)\dsp\rescaler_mips32.obj \
+    $(DIROBJ)\dsp\rescaler_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\rescaler_neon.obj \
+    $(DIROBJ)\dsp\rescaler_sse2.obj \
    $(DIROBJ)\dsp\upsampling.obj \
+    $(DIROBJ)\dsp\upsampling_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\upsampling_neon.obj \
    $(DIROBJ)\dsp\upsampling_sse2.obj \
    $(DIROBJ)\dsp\yuv.obj \
    $(DIROBJ)\dsp\yuv_mips32.obj \
+    $(DIROBJ)\dsp\yuv_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\yuv_sse2.obj \

 DSP_ENC_OBJS = \
+    $(DIROBJ)\dsp\argb.obj \
+    $(DIROBJ)\dsp\argb_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\argb_sse2.obj \
+    $(DIROBJ)\dsp\cost.obj \
+    $(DIROBJ)\dsp\cost_mips32.obj \
+    $(DIROBJ)\dsp\cost_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\cost_sse2.obj \
    $(DIROBJ)\dsp\enc.obj \
    $(DIROBJ)\dsp\enc_avx2.obj \
    $(DIROBJ)\dsp\enc_mips32.obj \
+    $(DIROBJ)\dsp\enc_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\enc_neon.obj \
    $(DIROBJ)\dsp\enc_sse2.obj \
+    $(DIROBJ)\dsp\enc_sse41.obj \
+    $(DIROBJ)\dsp\lossless_enc.obj \
+    $(DIROBJ)\dsp\lossless_enc_mips32.obj \
+    $(DIROBJ)\dsp\lossless_enc_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\lossless_enc_neon.obj \
+    $(DIROBJ)\dsp\lossless_enc_sse2.obj \
+    $(DIROBJ)\dsp\lossless_enc_sse41.obj \
+
+EX_ANIM_UTIL_OBJS = \
+    $(DIROBJ)\examples\anim_util.obj \

 EX_FORMAT_DEC_OBJS = \
+    $(DIROBJ)\examples\image_dec.obj \
    $(DIROBJ)\examples\jpegdec.obj \
    $(DIROBJ)\examples\metadata.obj \
    $(DIROBJ)\examples\pngdec.obj \
@ -211,6 +253,9 @@ EX_FORMAT_DEC_OBJS = \
    $(DIROBJ)\examples\webpdec.obj \
    $(DIROBJ)\examples\wicdec.obj \

+EX_GIF_DEC_OBJS = \
+    $(DIROBJ)\examples\gifdec.obj \
+
 EX_UTIL_OBJS = \
    $(DIROBJ)\examples\example_util.obj \

@ -220,10 +265,12 @@ ENC_OBJS = \
    $(DIROBJ)\enc\backward_references.obj \
    $(DIROBJ)\enc\config.obj \
    $(DIROBJ)\enc\cost.obj \
+    $(DIROBJ)\enc\delta_palettization.obj \
    $(DIROBJ)\enc\filter.obj \
    $(DIROBJ)\enc\frame.obj \
    $(DIROBJ)\enc\histogram.obj \
    $(DIROBJ)\enc\iterator.obj \
+    $(DIROBJ)\enc\near_lossless.obj \
    $(DIROBJ)\enc\picture.obj \
    $(DIROBJ)\enc\picture_csp.obj \
    $(DIROBJ)\enc\picture_psnr.obj \
@ -237,6 +284,7 @@ ENC_OBJS = \
    $(DIROBJ)\enc\webpenc.obj \

 MUX_OBJS = \
+    $(DIROBJ)\mux\anim_encode.obj \
    $(DIROBJ)\mux\muxedit.obj \
    $(DIROBJ)\mux\muxinternal.obj \
    $(DIROBJ)\mux\muxread.obj \
@ -264,19 +312,34 @@ LIBWEBPMUX_OBJS = $(MUX_OBJS) $(LIBWEBPMUX_OBJS)
 LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS) $(LIBWEBPDEMUX_OBJS)

 OUT_LIBS = $(LIBWEBPDECODER) $(LIBWEBP)
+!IF "$(ARCH)" == "ARM"
+ex: $(OUT_LIBS)
+all: ex
+!ELSE
 OUT_EXAMPLES = $(DIRBIN)\cwebp.exe $(DIRBIN)\dwebp.exe
 EXTRA_EXAMPLES = $(DIRBIN)\vwebp.exe $(DIRBIN)\webpmux.exe

 ex: $(OUT_LIBS) $(OUT_EXAMPLES)
 all: ex $(EXTRA_EXAMPLES)
+# NB: gif2webp.exe and anim_diff.exe are excluded from 'all' as libgif requires
+# C99 support which is only available from VS2013 onward.
+gif2webp: $(DIRBIN)\gif2webp.exe
+anim_diff: $(DIRBIN)\anim_diff.exe
+
+$(DIRBIN)\anim_diff.exe: $(DIROBJ)\examples\anim_diff.obj $(EX_ANIM_UTIL_OBJS)
+$(DIRBIN)\anim_diff.exe: $(EX_UTIL_OBJS)
+$(DIRBIN)\anim_diff.exe: $(EX_GIF_DEC_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj $(EX_FORMAT_DEC_OBJS)
 $(DIRBIN)\dwebp.exe: $(DIROBJ)\examples\dwebp.obj
+$(DIRBIN)\gif2webp.exe: $(DIROBJ)\examples\gif2webp.obj $(EX_GIF_DEC_OBJS)
+$(DIRBIN)\gif2webp.exe: $(EX_UTIL_OBJS) $(LIBWEBPMUX) $(LIBWEBP)
 $(DIRBIN)\vwebp.exe: $(DIROBJ)\examples\vwebp.obj
 $(DIRBIN)\vwebp.exe: $(EX_UTIL_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\webpmux.exe: $(DIROBJ)\examples\webpmux.obj $(LIBWEBPMUX)
 $(DIRBIN)\webpmux.exe: $(EX_UTIL_OBJS) $(LIBWEBP)
 $(OUT_EXAMPLES): $(EX_UTIL_OBJS) $(LIBWEBP)
 $(EX_UTIL_OBJS) $(EX_FORMAT_DEC_OBJS): $(OUTPUT_DIRS)
+!ENDIF  # ARCH == ARM

 experimental:
 	$(MAKE) /f Makefile.vc \
@ -292,7 +355,7 @@ $(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): $(OUTPUT_DIRS)

 !IF "$(DLLBUILD)" == "TRUE"
 $(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): \
-    $(DIROBJ)\$(DLLINC) $(DIROBJ)\$(DLLC)
+    $(DIROBJ)\$(DLLINC)

 {$(DIROBJ)}.c{$(DIROBJ)}.obj:
 	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$@  $<
@ -305,7 +368,7 @@ $(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
 	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y

 clean::
-	@-erase /s $(DIROBJ)\$(DLLC) $(DIROBJ)\$(DLLINC) 2> NUL
+	@-erase /s $(DIROBJ)\$(DLLINC) 2> NUL
 !ELSE
 $(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
 	$(LNKLIB) /out:$@ $**
@ -322,22 +385,24 @@ $(DIROBJ)\$(DLLINC):
 	@echo #define WEBP_EXTERN(type) __declspec(dllexport) type >> $@
 	@echo #endif  /* WEBP_DLL_H_ */ >> $@

-# expose a WebPFree() function for use in managed code
-$(DIROBJ)\$(DLLC): $(DIROBJ)\$(DLLINC)
-	@echo #include ^<stdlib.h^> > $@
-	@echo #include "webp_dll.h" >> $@
-	@echo // This function should be used in place of free() for memory >> $@
-	@echo // returned by the WebP API. >> $@
-	@echo WEBP_EXTERN(void) WebPFree(void* ptr) { >> $@
-	@echo   free(ptr); >> $@
-	@echo } >> $@
-
 .SUFFIXES: .c .obj .res .exe
 # File-specific flag builds. Note batch rules take precedence over wildcards,
 # so for now name each file individually.
 $(DIROBJ)\dsp\enc_avx2.obj: src\dsp\enc_avx2.c
 	$(CC) $(CFLAGS) $(AVX2_FLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dsp\ \
 	  src\dsp\$(@B).c
+$(DIROBJ)\examples\anim_diff.obj: examples\anim_diff.c
+	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
+	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
+$(DIROBJ)\examples\anim_util.obj: examples\anim_util.c
+	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
+	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
+$(DIROBJ)\examples\gif2webp.obj: examples\gif2webp.c
+	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
+	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
+$(DIROBJ)\examples\gifdec.obj: examples\gifdec.c
+	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
+	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
 # Batch rules
 {examples}.c{$(DIROBJ)\examples}.obj::
 	$(CC) $(CFLAGS) /Fd$(DIROBJ)\examples\ /Fo$(DIROBJ)\examples\ $<
--- a/47
+++ b/47
@ -1,3 +1,47 @@
+- 6/14/2016: version 0.5.1
+  This is a binary compatible release.
+  * miscellaneous bug fixes (issues #280, #289)
+  * reverted alpha plane encoding with color cache for compatibility with
+    libwebp 0.4.0->0.4.3 (issues #291, #298)
+  * lossless encoding performance improvements
+  * memory reduction in both lossless encoding and decoding
+  * force mux output to be in the extended format (VP8X) when undefined chunks
+    are present (issue #294)
+  * gradle, cmake build support
+  * workaround for compiler bug causing 64-bit decode failures on android
+    devices using clang-3.8 in the r11c NDK
+  * various WebPAnimEncoder improvements
+
+- 12/17/2015: version 0.5.0
+  * miscellaneous bug & build fixes (issues #234, #258, #274, #275, #278)
+  * encoder & decoder speed-ups on x86/ARM/MIPS for lossy & lossless
+    - note! YUV->RGB conversion was sped-up, but the results will be slightly
+      different from previous releases
+  * various lossless encoder improvements
+  * gif2webp improvements, -min_size option added
+  * tools fully support input from stdin and output to stdout (issue #168)
+  * New WebPAnimEncoder API for creating animations
+  * New WebPAnimDecoder API for decoding animations
+  * other API changes:
+    - libwebp:
+      WebPPictureSmartARGBToYUVA() (-pre 4 in cwebp)
+      WebPConfig::exact (-exact in cwebp; -alpha_cleanup is now the default)
+      WebPConfig::near_lossless (-near_lossless in cwebp)
+      WebPFree() (free'ing webp allocated memory in other languages)
+      WebPConfigLosslessPreset()
+      WebPMemoryWriterClear()
+    - libwebpdemux: removed experimental fragment related fields and functions
+    - libwebpmux: WebPMuxSetCanvasSize()
+  * new libwebpextras library with some uncommon import functions:
+    WebPImportGray/WebPImportRGB565/WebPImportRGB4444
+
+- 10/15/15: version 0.4.4
+  This is a binary compatible release.
+  * rescaling out-of-bounds read fix (issue #254)
+  * various build fixes and improvements (issues #253, #259, #262, #267, #268)
+  * container documentation update
+  * gif2webp transparency fix (issue #245)
+
 - 3/3/15: version 0.4.3
  This is a binary compatible release.
  * Android / gcc / iOS / MSVS build fixes and improvements
@ -89,7 +133,8 @@
 - 9/19/11: version 0.1.3
  * Advanced decoding APIs.
  * On-the-fly cropping and rescaling of images.
-  * SSE2 instructions for decoding performance optimizations on x86 based platforms.
+  * SSE2 instructions for decoding performance optimizations on x86 based
+    platforms.
  * Support Multi-threaded decoding.
  * 40% improvement in Decoding performance.
  * Add support for RGB565, RGBA4444 & ARGB image colorspace.
--- a/2
+++ b/2
@ -17,7 +17,7 @@ or agree to the institution of patent litigation or any other patent
 enforcement activity against any entity (including a cross-claim or
 counterclaim in a lawsuit) alleging that any of these implementations of WebM
 or any code incorporated within any of these implementations of WebM
-constitutes direct or contributory patent infringement, or inducement of
+constitute direct or contributory patent infringement, or inducement of
 patent infringement, then any patent rights granted to you under this License
 for these implementations of WebM shall terminate as of the date such
 litigation is filed.
--- a/139
+++ b/139
@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.4.3
+               \____/____/\_____/_____/____/v0.5.1

 Description:
 ============
@ -15,11 +15,12 @@ as well as the command line tools 'cwebp' and 'dwebp'.

 See http://developers.google.com/speed/webp

-Latest sources are available from http://www.webmproject.org/code/
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp

 It is released under the same license as the WebM project.
 See http://www.webmproject.org/license/software/ or the
-file "COPYING" file for details. An additional intellectual
+"COPYING" file for details. An additional intellectual
 property rights grant can be found in the file PATENTS.

 Building:
@ -53,6 +54,12 @@ Please refer to makefile.unix for additional details and customizations.

 Using autoconf tools:
 ---------------------
+Prerequisites:
+A compiler (e.g., gcc), make, autoconf, automake, libtool.
+On a Debian-like system the following should install everything you need for a
+minimal build:
+$ sudo apt-get install gcc make autoconf automake libtool
+
 When building from git sources, you will need to run autogen.sh to generate the
 configure script.

@ -77,6 +84,73 @@ be installed independently using a minor modification in the corresponding
 Makefile.am configure files (see comments there). See './configure --help' for
 more options.

+Building for MIPS Linux:
+------------------------
+Stable releases of the MIPS Linux toolchain can be found at:
+https://community.imgtec.com/developers/mips/tools/codescape-mips-sdk/available-releases/
+
+# Add toolchain to PATH
+export PATH=$PATH:/path/to/toolchain/bin
+
+# 32-bit build for mips32r5 (p5600)
+HOST=mips-mti-linux-gnu
+MIPS_CFLAGS="-O3 -mips32r5 -mabi=32 -mtune=p5600 -mmsa -mfp64 \
+  -msched-weight -mload-store-pairs -fPIE"
+MIPS_LDFLAGS="-mips32r5 -mabi=32 -mmsa -mfp64 -pie"
+
+# 64-bit build for mips64r6 (i6400)
+HOST=mips-img-linux-gnu
+MIPS_CFLAGS="-O3 -mips64r6 -mabi=64 -mtune=i6400 -mmsa -mfp64 \
+  -msched-weight -mload-store-pairs -fPIE"
+MIPS_LDFLAGS="-mips64r6 -mabi=64 -mmsa -mfp64 -pie"
+
+./configure --host=${HOST} --build=`config.guess` \
+  CC="${HOST}-gcc -EL" \
+  CFLAGS="$MIPS_CFLAGS" \
+  LDFLAGS="$MIPS_LDFLAGS"
+make
+make install
+
+CMake:
+------
+The support for CMake is minimal: it only helps you compile libwebp, cwebp and
+dwebp.
+
+Prerequisites:
+A compiler (e.g., gcc with autotools) and CMake.
+On a Debian-like system the following should install everything you need for a
+minimal build:
+$ sudo apt-get install build-essential cmake
+
+When building from git sources, you will need to run cmake to generate the
+makefiles.
+
+mkdir build && cd build && cmake ../
+make
+make install
+
+If you also want cwebp or dwebp, you will need to enable them through CMake:
+
+cmake -DWEBP_BUILD_CWEBP=ON -DWEBP_BUILD_DWEBP=ON ../
+
+or through your favorite interface (like ccmake or cmake-qt-gui).
+
+Gradle:
+-------
+The support for Gradle is minimal: it only helps you compile libwebp, cwebp,
+dwebp and webpmux_example.
+
+Prerequisites:
+A compiler (e.g., gcc with autotools) and gradle.
+On a Debian-like system the following should install everything you need for a
+minimal build:
+$ sudo apt-get install build-essential gradle
+
+When building from git sources, you will need to run the Gradle wrapper with the
+appropriate target, e.g. :
+
+./gradlew buildAllExecutables
+
 SWIG bindings:
 --------------

@ -144,14 +218,16 @@ If input size (-s) for an image is not specified, it is
 assumed to be a PNG, JPEG, TIFF or WebP file.

 Options:
-  -h / -help  ............ short help
-  -H / -longhelp  ........ long help
+  -h / -help ............. short help
+  -H / -longhelp ......... long help
  -q <float> ............. quality factor (0:small..100:big)
  -alpha_q <int> ......... transparency-compression quality (0..100)
  -preset <string> ....... preset setting, one of:
                            default, photo, picture,
                            drawing, icon, text
     -preset must come first, as it overwrites other parameters
+  -z <int> ............... activates lossless preset with given
+                           level in [0:fast, ..., 9:slowest]

  -m <int> ............... compression method (0=fast, 6=slowest)
  -segments <int> ........ number of segments to use (1..4)
@ -179,13 +255,15 @@ Options:
  -alpha_method <int> .... transparency-compression method (0..1)
  -alpha_filter <string> . predictive filtering for alpha plane,
                           one of: none, fast (default) or best
-  -alpha_cleanup ......... clean RGB values in transparent area
+  -exact ................. preserve RGB values in transparent area
  -blend_alpha <hex> ..... blend colors against background color
                           expressed as RGB values written in
                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0
                           green=0xe0 and blue=0xd0
  -noalpha ............... discard any transparency information
  -lossless .............. encode image losslessly
+  -near_lossless <int> ... use near-lossless image
+                           preprocessing (0..100=off)
  -hint <string> ......... specify image characteristics hint,
                           one of: photo, picture or graph

@ -263,18 +341,21 @@ Use following options to convert into alternate image formats:
  -yuv ......... save the raw YUV samples in flat layout

 Other options are:
-  -version  .... print version number and exit
+  -version ..... print version number and exit
  -nofancy ..... don't use the fancy YUV420 upscaler
  -nofilter .... disable in-loop filtering
  -nodither .... disable dithering
  -dither <d> .. dithering strength (in 0..100)
+  -alpha_dither  use alpha-plane dithering if needed
  -mt .......... use multi-threading
  -crop <x> <y> <w> <h> ... crop output with the given rectangle
-  -scale <w> <h> .......... scale the output (*after* any cropping)
+  -resize <w> <h> ......... scale the output (*after* any cropping)
+  -flip ........ flip the output vertically
  -alpha ....... only save the alpha plane
  -incremental . use incremental decoding (useful for tests)
-  -h     ....... this help message
-  -v     ....... verbose (e.g. print encoding/decoding times)
+  -h ........... this help message
+  -v ........... verbose (e.g. print encoding/decoding times)
+  -quiet ....... quiet mode, don't print anything
  -noasm ....... disable all assembly optimizations

 Visualization tool:
@ -289,14 +370,15 @@ Usage: vwebp in_file [options]

 Decodes the WebP image file and visualize it using OpenGL
 Options are:
-  -version  .... print version number and exit
+  -version ..... print version number and exit
  -noicc ....... don't use the icc profile if present
  -nofancy ..... don't use the fancy YUV420 upscaler
  -nofilter .... disable in-loop filtering
  -dither <int>  dithering strength (0..100), default=50
+  -noalphadither disable alpha plane dithering
  -mt .......... use multi-threading
  -info ........ print info
-  -h     ....... this help message
+  -h ........... this help message

 Keyboard shortcuts:
  'c' ................ toggle use of color profile
@ -338,12 +420,16 @@ vwebp.
 Usage:
 gif2webp [options] gif_file -o webp_file
 Options:
-  -h / -help  ............ this help
+  -h / -help ............. this help
  -lossy ................. encode image using lossy compression
  -mixed ................. for each frame in the image, pick lossy
                           or lossless compression heuristically
  -q <float> ............. quality factor (0:small..100:big)
  -m <int> ............... compression method (0=fast, 6=slowest)
+  -min_size .............. minimize output size (default:off)
+                           lossless compression by default; can be
+                           combined with -q, -m, -lossy or -mixed
+                           options
  -kmin <int> ............ min distance between key frames
  -kmax <int> ............ max distance between key frames
  -f <int> ............... filter strength (0=off..100)
@ -366,6 +452,29 @@ or using autoconf:
 $ ./configure --enable-everything
 $ make

+Comparison of animated images:
+==============================
+Test utility anim_diff under examples/ can be used to compare two animated
+images (each can be GIF or WebP).
+
+Usage: anim_diff <image1> <image2> [options]
+
+Options:
+  -dump_frames <folder> dump decoded frames in PAM format
+  -min_psnr <float> ... minimum per-frame PSNR
+  -raw_comparison ..... if this flag is not used, RGB is
+                        premultiplied before comparison
+
+Building:
+---------
+With the libgif development files and a C++ compiler installed, anim_diff can
+be built using makefile.unix:
+$ make -f makefile.unix examples/anim_diff
+
+or using autoconf:
+$ ./configure --enable-everything
+$ make
+
 Encoding API:
 =============

@ -595,8 +704,8 @@ an otherwise too-large picture. Some CPU can be saved too, incidentally.
 Bugs:
 =====

-Please report all bugs to our issue tracker:
-    http://code.google.com/p/webp/issues
+Please report all bugs to the issue tracker:
+    https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
    http://www.webmproject.org/code/contribute/submitting-patches/

--- a/README.mux
+++ b/README.mux
@ -1,7 +1,7 @@
          __   __  ____  ____  ____  __ __  _     __ __
         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
         \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.2.2
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.3.1


 Description:
@ -133,7 +133,7 @@ WebP files. This API currently supports reading of XMP/EXIF metadata, ICC
 profile and animated images. Other features may be added in subsequent
 releases.

-Code Example: Demuxing WebP data to extract all the frames, ICC profile
+Code example: Demuxing WebP data to extract all the frames, ICC profile
 and EXIF/XMP metadata.

  WebPDemuxer* demux = WebPDemux(&webp_data);
@ -170,12 +170,36 @@ and EXIF/XMP metadata.
 For a detailed Demux API reference, please refer to the header file
 (src/webp/demux.h).

+AnimEncoder API:
+================
+The AnimEncoder API can be used to create animated WebP images.
+
+Code example:
+
+  WebPAnimEncoderOptions enc_options;
+  WebPAnimEncoderOptionsInit(&enc_options);
+  // ... (Tune 'enc_options' as needed).
+  WebPAnimEncoder* enc = WebPAnimEncoderNew(width, height, &enc_options);
+  while(<there are more frames>) {
+    WebPConfig config;
+    WebPConfigInit(&config);
+    // ... (Tune 'config' as needed).
+    WebPAnimEncoderAdd(enc, frame, duration, &config);
+  }
+  WebPAnimEncoderAssemble(enc, webp_data);
+  WebPAnimEncoderDelete(enc);
+  // ... (Write the 'webp_data' to a file, or re-mux it further).
+
+
+For a detailed AnimEncoder API reference, please refer to the header file
+(src/webp/mux.h).
+

 Bugs:
 =====

-Please report all bugs to our issue tracker:
-    http://code.google.com/p/webp/issues
+Please report all bugs to the issue tracker:
+    https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
    http://www.webmproject.org/code/contribute/submitting-patches/

--- a/build.gradle
+++ b/build.gradle
@ -0,0 +1,337 @@
+// Define dependencies.
+buildscript {
+  repositories {
+    maven {
+      url "https://jcenter.bintray.com"
+    }
+  }
+  dependencies {
+    classpath "com.android.tools.build:gradle:${ANDROID_GRADLE_PLUGIN_VERSION}"
+  }
+}
+
+// Define versions in the project.
+project.ext {
+  buildToolsVersion = "${BUILD_TOOLS_VERSION}"
+  compileSdkVersion = COMPILE_SDK_VERSION.toInteger()
+}
+
+// Core libraries and executables.
+apply plugin: "c"
+def NEON
+model {
+  buildTypes {
+    debug
+    release
+  }
+  platforms {
+    arm {
+      architecture "arm"
+    }
+    arm64 {
+      architecture "arm64"
+    }
+    x86 {
+      architecture "x86"
+    }
+    x64 {
+      architecture "x86_64"
+    }
+    mips32r2
+    mips32r5
+    mips64r6
+  }
+  toolChains {
+    gcc(Gcc) {
+      target("mips32r2") {
+        cCompiler.args "-mips32r2"
+      }
+      target("mips32r5") {
+        cCompiler.args "-mips32r5"
+      }
+      target("mips64r6") {
+        cCompiler.args "-mips64r6"
+      }
+    }
+  }
+  // Per-binary compiler/linker settings applied on top of the toolchain
+  // defaults.
+  binaries {
+    all {
+      // Flags and defines common to every GCC-built binary.
+      if (toolChain in Gcc) {
+        cCompiler.args "-fPIC"
+        cCompiler.args "-Wall"
+        cCompiler.define "ANDROID"
+        cCompiler.define "HAVE_MALLOC_H"
+      }
+      // Optimizations.
+      if (buildType == buildTypes.release) {
+        if (toolChain in Gcc) {
+          cCompiler.args "-finline-functions"
+          cCompiler.args "-ffast-math"
+          cCompiler.args "-ffunction-sections"
+          cCompiler.args "-fdata-sections"
+        }
+        if (toolChain in Clang) {
+          // Each flag is passed as its own argument; a single string
+          // containing a space would reach the compiler as one option.
+          cCompiler.args "-frename-registers", "-s"
+        }
+      }
+      // Check for NEON usage.
+      // NEON holds the file extension used by the "*_neon.$NEON" includes.
+      if (getTargetPlatform() == "arm" || getTargetPlatform() == "arm64") {
+        NEON = "c.neon"
+      } else {
+        NEON = "c"
+      }
+    }
+    // Link to pthread for shared libraries.
+    withType(SharedLibraryBinarySpec) {
+      if (toolChain in Gcc) {
+        cCompiler.define "HAVE_PTHREAD"
+        cCompiler.define "WEBP_USE_THREAD"
+        linker.args "-pthread"
+      }
+    }
+  }
+  components {
+    webp(NativeLibrarySpec) {
+      sources {
+        c {
+          source {
+            srcDir "src/dec"
+            include "alpha.c"
+            include "buffer.c"
+            include "frame.c"
+            include "idec.c"
+            include "io.c"
+            include "quant.c"
+            include "tree.c"
+            include "vp8.c"
+            include "vp8l.c"
+            include "webp.c"
+            srcDir "src/dsp"
+            include "alpha_processing.c"
+            include "alpha_processing_mips_dsp_r2.c"
+            include "alpha_processing_sse2.c"
+            include "alpha_processing_sse41.c"
+            include "argb.c"
+            include "argb_mips_dsp_r2.c"
+            include "argb_sse2.c"
+            include "cpu.c"
+            include "dec.c"
+            include "dec_clip_tables.c"
+            include "dec_mips32.c"
+            include "dec_mips_dsp_r2.c"
+            include "dec_msa.c"
+            include "dec_neon.$NEON"
+            include "dec_sse2.c"
+            include "dec_sse41.c"
+            include "filters.c"
+            include "filters_mips_dsp_r2.c"
+            include "filters_sse2.c"
+            include "lossless.c"
+            include "lossless_mips_dsp_r2.c"
+            include "lossless_neon.$NEON"
+            include "lossless_sse2.c"
+            include "rescaler.c"
+            include "rescaler_mips32.c"
+            include "rescaler_mips_dsp_r2.c"
+            include "rescaler_neon.$NEON"
+            include "rescaler_sse2.c"
+            include "upsampling.c"
+            include "upsampling_mips_dsp_r2.c"
+            include "upsampling_neon.$NEON"
+            include "upsampling_sse2.c"
+            include "yuv.c"
+            include "yuv_mips32.c"
+            include "yuv_mips_dsp_r2.c"
+            include "yuv_sse2.c"
+            srcDir "src/utils"
+            include "ans.c"
+            include "bit_reader.c"
+            include "color_cache.c"
+            include "filters.c"
+            include "huffman.c"
+            include "quant_levels_dec.c"
+            include "random.c"
+            include "rescaler.c"
+            include "thread.c"
+            include "utils.c"
+            srcDir "src/dsp"
+            include "cost.c"
+            include "cost_mips32.c"
+            include "cost_mips_dsp_r2.c"
+            include "cost_sse2.c"
+            include "enc.c"
+            include "enc_avx2.c"
+            include "enc_mips32.c"
+            include "enc_mips_dsp_r2.c"
+            include "enc_neon.$NEON"
+            include "enc_sse2.c"
+            include "enc_sse41.c"
+            include "lossless_enc.c"
+            include "lossless_enc_mips32.c"
+            include "lossless_enc_mips_dsp_r2.c"
+            include "lossless_enc_neon.$NEON"
+            include "lossless_enc_sse2.c"
+            include "lossless_enc_sse41.c"
+            srcDir "src/enc"
+            include "alpha.c"
+            include "analysis.c"
+            include "backward_references.c"
+            include "config.c"
+            include "cost.c"
+            include "delta_palettization.c"
+            include "filter.c"
+            include "frame.c"
+            include "histogram.c"
+            include "iterator.c"
+            include "near_lossless.c"
+            include "picture.c"
+            include "picture_csp.c"
+            include "picture_psnr.c"
+            include "picture_rescale.c"
+            include "picture_tools.c"
+            include "quant.c"
+            include "syntax.c"
+            include "token.c"
+            include "tree.c"
+            include "vp8l.c"
+            include "webpenc.c"
+            srcDir "src/utils"
+            include "bit_writer.c"
+            include "huffman_encode.c"
+            include "quant_levels.c"
+          }
+          exportedHeaders {
+            srcDir "src"
+          }
+        }
+      }
+    }
+
+    webpdemux(NativeLibrarySpec) {
+      sources {
+        c {
+          source {
+            srcDir "src/demux"
+            include "anim_decode.c"
+            include "demux.c"
+          }
+        }
+      }
+    }
+
+    webpmux(NativeLibrarySpec) {
+      sources {
+        c {
+          source {
+            srcDir "src/mux/"
+            include "anim_encode.c"
+            include "muxedit.c"
+            include "muxinternal.c"
+            include "muxread.c"
+          }
+        }
+      }
+    }
+
+    // Executables from examples.
+    example_util(NativeLibrarySpec) {
+      binaries {
+        all {
+          lib library: "webp", linkage: "static"
+        }
+      }
+      sources {
+        c {
+          source {
+            srcDir "./examples"
+            include "example_util.c"
+          }
+        }
+      }
+    }
+
+    example_dec(NativeLibrarySpec) {
+      binaries {
+        all {
+          lib library: "webp", linkage: "static"
+        }
+      }
+      sources {
+        c {
+          source {
+            srcDir "./examples"
+            include "image_dec.c"
+            include "jpegdec.c"
+            include "metadata.c"
+            include "pngdec.c"
+            include "tiffdec.c"
+            include "webpdec.c"
+          }
+        }
+      }
+    }
+    cwebp(NativeExecutableSpec) {
+      binaries {
+        all {
+          lib library: "example_util", linkage: "static"
+          lib library: "example_dec", linkage: "static"
+          lib library: "webp", linkage: "static"
+        }
+      }
+      sources {
+        c {
+          source {
+            srcDir "./examples"
+            include "cwebp.c"
+          }
+        }
+      }
+    }
+
+    dwebp(NativeExecutableSpec) {
+      binaries {
+        all {
+          lib library: "example_util", linkage: "static"
+          lib library: "webp"
+        }
+      }
+      sources {
+        c {
+          source {
+              srcDir "./examples"
+              include "dwebp.c"
+          }
+        }
+      }
+    }
+
+    webpmux_example(NativeExecutableSpec) {
+      binaries {
+        all {
+          lib library: "example_util", linkage: "static"
+          lib library: "webpmux", linkage: "static"
+          lib library: "webp"
+        }
+      }
+      sources {
+        c {
+          source {
+            srcDir "./examples"
+            include "webpmux.c"
+          }
+        }
+      }
+    }
+  }
+  tasks {
+    // Task to test all possible configurations.
+    buildAllExecutables(Task) {
+      dependsOn $.binaries.findAll { it.buildable }
+    }
+  }
+}
+
+// Task to generate the wrapper.
+task wrapper(type: Wrapper) {
+  gradleVersion = '2.13'
+}
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@ -0,0 +1,146 @@
+/* Adapted from the autotools src/webp/config.h.in.  */
+
+/* Define if building universal (internal helper macro) */
+/* TODO: handle properly in CMake */
+#cmakedefine AC_APPLE_UNIVERSAL_BUILD 1
+
+/* Set to 1 if __builtin_bswap16 is available */
+#cmakedefine HAVE_BUILTIN_BSWAP16 1
+
+/* Set to 1 if __builtin_bswap32 is available */
+#cmakedefine HAVE_BUILTIN_BSWAP32 1
+
+/* Set to 1 if __builtin_bswap64 is available */
+#cmakedefine HAVE_BUILTIN_BSWAP64 1
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#cmakedefine HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the <GLUT/glut.h> header file. */
+#cmakedefine HAVE_GLUT_GLUT_H 1
+
+/* Define to 1 if you have the <GL/glut.h> header file. */
+#cmakedefine HAVE_GL_GLUT_H 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#cmakedefine HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#cmakedefine HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the <OpenGL/glut.h> header file. */
+#cmakedefine HAVE_OPENGL_GLUT_H 1
+
+/* Have PTHREAD_PRIO_INHERIT. */
+#cmakedefine HAVE_PTHREAD_PRIO_INHERIT @HAVE_PTHREAD_PRIO_INHERIT@
+
+/* Define to 1 if you have the <shlwapi.h> header file. */
+#cmakedefine HAVE_SHLWAPI_H 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#cmakedefine HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#cmakedefine HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#cmakedefine HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#cmakedefine HAVE_STRING_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#cmakedefine HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#cmakedefine HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#cmakedefine HAVE_UNISTD_H 1
+
+/* Define to 1 if you have the <wincodec.h> header file. */
+#cmakedefine HAVE_WINCODEC_H 1
+
+/* Define to 1 if you have the <windows.h> header file. */
+#cmakedefine HAVE_WINDOWS_H 1
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+/* TODO: handle properly in CMake */
+#cmakedefine LT_OBJDIR "@LT_OBJDIR@"
+
+/* Name of package */
+#cmakedefine PACKAGE "@PROJECT_NAME@"
+
+/* Define to the address where bug reports for this package should be sent. */
+#cmakedefine PACKAGE_BUGREPORT "@PACKAGE_BUGREPORT@"
+
+/* Define to the full name of this package. */
+#cmakedefine PACKAGE_NAME "@PACKAGE_NAME@"
+
+/* Define to the full name and version of this package. */
+#cmakedefine PACKAGE_STRING "@PACKAGE_STRING@"
+
+/* Define to the one symbol short name of this package. */
+#cmakedefine PACKAGE_TARNAME "@PACKAGE_TARNAME@"
+
+/* Define to the home page for this package. */
+#cmakedefine PACKAGE_URL "@PACKAGE_URL@"
+
+/* Define to the version of this package. */
+#cmakedefine PACKAGE_VERSION "@PACKAGE_VERSION@"
+
+/* Define to necessary symbol if this constant uses a non-standard name on
+   your system. */
+#cmakedefine PTHREAD_CREATE_JOINABLE 1
+
+/* Define to 1 if you have the ANSI C header files. */
+#cmakedefine STDC_HEADERS 1
+
+/* Version number of package */
+#cmakedefine VERSION "@VERSION@"
+
+/* Enable experimental code */
+#cmakedefine WEBP_EXPERIMENTAL_FEATURES 1
+
+/* Define to 1 to force aligned memory operations */
+#cmakedefine WEBP_FORCE_ALIGNED 1
+
+/* Set to 1 if AVX2 is supported */
+#cmakedefine WEBP_HAVE_AVX2 1
+
+/* Set to 1 if GIF library is installed */
+#cmakedefine WEBP_HAVE_GIF 1
+
+/* Set to 1 if OpenGL is supported */
+#cmakedefine WEBP_HAVE_GL 1
+
+/* Set to 1 if JPEG library is installed */
+#cmakedefine WEBP_HAVE_JPEG 1
+
+/* Set to 1 if PNG library is installed */
+#cmakedefine WEBP_HAVE_PNG 1
+
+/* Set to 1 if SSE2 is supported */
+#cmakedefine WEBP_HAVE_SSE2 1
+
+/* Set to 1 if SSE4.1 is supported */
+#cmakedefine WEBP_HAVE_SSE41 1
+
+/* Set to 1 if TIFF library is installed */
+#cmakedefine WEBP_HAVE_TIFF 1
+
+/* Undefine this to disable thread support. */
+#cmakedefine WEBP_USE_THREAD 1
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+   significant byte first (like Motorola and SPARC, unlike Intel).
+   The autotools '#undef' placeholder is not rewritten by CMake's
+   configure_file(), so '#cmakedefine' is used instead: it becomes a #define
+   when the CMake variable WORDS_BIGENDIAN is set and a commented-out #undef
+   otherwise. NOTE(review): confirm the CMake scripts set WORDS_BIGENDIAN. */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+#cmakedefine WORDS_BIGENDIAN 1
+# endif
+#endif
--- a/configure.ac
+++ b/configure.ac
@ -1,5 +1,5 @@
-AC_INIT([libwebp], [0.4.3],
-        [http://code.google.com/p/webp/issues],,
+AC_INIT([libwebp], [0.5.1],
+        [https://bugs.chromium.org/p/webp],,
        [http://developers.google.com/speed/webp])
 AC_CANONICAL_HOST
 AC_PREREQ([2.60])
@ -28,8 +28,21 @@ AC_ARG_ENABLE([everything],
                              disabled with --disable-target]),
              [SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
               SET_IF_UNSET([enable_libwebpdemux], [$enableval])
+               SET_IF_UNSET([enable_libwebpextras], [$enableval])
               SET_IF_UNSET([enable_libwebpmux], [$enableval])])

+dnl === If --enable-asserts is not defined, define NDEBUG
+
+AC_MSG_CHECKING(whether asserts are enabled)
+AC_ARG_ENABLE([asserts],
+              AS_HELP_STRING([--enable-asserts],
+                             [Enable assert checks]))
+if test "x${enable_asserts-no}" = "xno"; then
+  AM_CPPFLAGS="${AM_CPPFLAGS} -DNDEBUG"
+fi
+AC_MSG_RESULT(${enable_asserts-no})
+AC_SUBST([AM_CPPFLAGS])
+
 AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=DIR],
            [Path to the pkgconfig directory @<:@LIBDIR/pkgconfig@:>@]),
            [pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
@ -51,15 +64,19 @@ AC_DEFUN([TEST_AND_ADD_CFLAGS],
                            [$1="${$1} $2"],
                            [AC_MSG_RESULT([no])])
          CFLAGS="$SAVED_CFLAGS"])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-fvisibility=hidden])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
-TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat-nonliteral])
-TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat-security])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wfloat-conversion])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-nonliteral])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-security])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-prototypes])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wold-style-definition])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshorten-64-to-32])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunreachable-code])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused-but-set-variable])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wvla])
@ -76,36 +93,145 @@ AS_IF([test "$GCC" = "yes" ], [
       esac
       AS_IF([test "$gcc_wht_bug" = "yes"], [
              TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-frename-registers])])])
+# Use -flax-vector-conversions, if available, when building intrinsics with
+# older versions of gcc. The flag appeared in 4.3.x, but if backported, and
+# -fno-lax-vector-conversions is set, errors may occur with the intrinsics
+# files along with the older system includes, e.g., emmintrin.h.
+# Originally observed with cc (GCC) 4.2.1 20070831 patched [FreeBSD] (9.3).
+# https://bugs.chromium.org/p/webp/issues/detail?id=274
+AS_IF([test "$GCC" = "yes" ], [
+       case "$host_cpu" in
+         amd64|i?86|x86_64)
+           AC_COMPILE_IFELSE(
+             dnl only check for -flax-vector-conversions with older gcc, skip
+             dnl clang as it reports itself as 4.2.1, but the flag isn't needed.
+             [AC_LANG_SOURCE([#if !defined(__clang__) && defined(__GNUC__) && \
+                                  ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x403
+                              #error old gcc
+                              #endif
+                              int main(void) { return 0; }
+                             ])],,
+              [TEST_AND_ADD_CFLAGS([INTRINSICS_CFLAGS],
+                                   [-flax-vector-conversions])])
+           ;;
+       esac])
 AC_SUBST([AM_CFLAGS])

 dnl === Check for machine specific flags
-TEST_AND_ADD_CFLAGS([AVX2_FLAGS], [-mavx2])
-AS_IF([test -n "$AVX2_FLAGS"], [
-  SAVED_CFLAGS=$CFLAGS
-  CFLAGS="$CFLAGS $AVX2_FLAGS"
-  AC_CHECK_HEADER([immintrin.h],
-                  [AC_DEFINE(WEBP_HAVE_AVX2, [1],
-                   [Set to 1 if AVX2 is supported])],
-                  [AVX2_FLAGS=""],
-                  dnl it's illegal to directly include avx2intrin.h, but it's
-                  dnl included conditionally in immintrin.h, tricky!
-                  [#ifndef __AVX2__
-                   #error avx2 is not enabled
-                   #endif
-                  ])
-  CFLAGS=$SAVED_CFLAGS])
-AC_SUBST([AVX2_FLAGS])
+AC_ARG_ENABLE([avx2],
+              AS_HELP_STRING([--disable-avx2],
+                             [Disable detection of AVX2 support
+                              @<:@default=auto@:>@]))

-TEST_AND_ADD_CFLAGS([SSE2_FLAGS], [-msse2])
-AS_IF([test -n "$SSE2_FLAGS"], [
-  SAVED_CFLAGS=$CFLAGS
-  CFLAGS="$CFLAGS $SSE2_FLAGS"
-  AC_CHECK_HEADER([emmintrin.h],
-                  [AC_DEFINE(WEBP_HAVE_SSE2, [1],
-                   [Set to 1 if SSE2 is supported])],
-                  [SSE2_FLAGS=""])
-  CFLAGS=$SAVED_CFLAGS])
-AC_SUBST([SSE2_FLAGS])
+dnl AVX2 is only probed when AVX2, SSE4.1 and SSE2 are all left enabled.
+dnl INTRINSICS_CFLAGS is prepended to AVX2_FLAGS, as is done for the
+dnl SSE4.1, SSE2 and NEON flags, so it reaches the AVX2 sources as well.
+AS_IF([test "x$enable_avx2" != "xno" -a "x$enable_sse4_1" != "xno" \
+         -a "x$enable_sse2" != "xno"], [
+  AVX2_FLAGS="$INTRINSICS_CFLAGS $AVX2_FLAGS"
+  TEST_AND_ADD_CFLAGS([AVX2_FLAGS], [-mavx2])
+  AS_IF([test -n "$AVX2_FLAGS"], [
+    SAVED_CFLAGS=$CFLAGS
+    CFLAGS="$CFLAGS $AVX2_FLAGS"
+    AC_CHECK_HEADER([immintrin.h],
+                    [AC_DEFINE(WEBP_HAVE_AVX2, [1],
+                     [Set to 1 if AVX2 is supported])],
+                    [AVX2_FLAGS=""],
+                    dnl it's illegal to directly include avx2intrin.h, but it's
+                    dnl included conditionally in immintrin.h, tricky!
+                    [#ifndef __AVX2__
+                     #error avx2 is not enabled
+                     #endif
+                    ])
+    CFLAGS=$SAVED_CFLAGS])
+  AC_SUBST([AVX2_FLAGS])])
+
+AC_ARG_ENABLE([sse4.1],
+              AS_HELP_STRING([--disable-sse4.1],
+                             [Disable detection of SSE4.1 support
+                              @<:@default=auto@:>@]))
+
+AS_IF([test "x$enable_sse4_1" != "xno" -a "x$enable_sse2" != "xno"], [
+  SSE41_FLAGS="$INTRINSICS_CFLAGS $SSE41_FLAGS"
+  TEST_AND_ADD_CFLAGS([SSE41_FLAGS], [-msse4.1])
+  AS_IF([test -n "$SSE41_FLAGS"], [
+    SAVED_CFLAGS=$CFLAGS
+    CFLAGS="$CFLAGS $SSE41_FLAGS"
+    AC_CHECK_HEADER([smmintrin.h],
+                    [AC_DEFINE(WEBP_HAVE_SSE41, [1],
+                     [Set to 1 if SSE4.1 is supported])],
+                    [SSE41_FLAGS=""])
+    CFLAGS=$SAVED_CFLAGS])
+  AC_SUBST([SSE41_FLAGS])])
+
+AC_ARG_ENABLE([sse2],
+              AS_HELP_STRING([--disable-sse2],
+                             [Disable detection of SSE2 support
+                              @<:@default=auto@:>@]))
+
+AS_IF([test "x$enable_sse2" != "xno"], [
+  SSE2_FLAGS="$INTRINSICS_CFLAGS $SSE2_FLAGS"
+  TEST_AND_ADD_CFLAGS([SSE2_FLAGS], [-msse2])
+  AS_IF([test -n "$SSE2_FLAGS"], [
+    SAVED_CFLAGS=$CFLAGS
+    CFLAGS="$CFLAGS $SSE2_FLAGS"
+    AC_CHECK_HEADER([emmintrin.h],
+                    [AC_DEFINE(WEBP_HAVE_SSE2, [1],
+                     [Set to 1 if SSE2 is supported])],
+                    [SSE2_FLAGS=""])
+    CFLAGS=$SAVED_CFLAGS])
+  AC_SUBST([SSE2_FLAGS])])
+
+AC_ARG_ENABLE([neon],
+              AS_HELP_STRING([--disable-neon],
+                             [Disable detection of NEON support
+                              @<:@default=auto@:>@]))
+
+AC_ARG_ENABLE([neon_rtcd],
+              AS_HELP_STRING([--disable-neon-rtcd],
+                             [Disable runtime detection of NEON support via
+                              /proc/cpuinfo on Linux hosts
+                              @<:@default=auto@:>@]))
+# For ARM(7) hosts:
+# Both NEON flags unset and NEON support detected = build all modules with NEON
+# NEON detected with the use of -mfpu=neon = build only NEON modules with NEON
+AS_IF([test "x$enable_neon" != "xno"], [
+  case "$host_cpu" in
+    arm|armv7*)
+      dnl Test for NEON support with no flags.
+      AC_CHECK_HEADER([arm_neon.h],
+                      [AC_DEFINE(WEBP_HAVE_NEON, [1],
+                       [Set to 1 if NEON is supported])],
+                      dnl Test for NEON support using -mfpu=neon
+                      [unset ac_cv_header_arm_neon_h
+                       NEON_FLAGS="$INTRINSICS_CFLAGS $NEON_FLAGS"
+                       TEST_AND_ADD_CFLAGS([NEON_FLAGS], [-mfpu=neon])
+                       AS_IF([test -n "$NEON_FLAGS"], [
+                         SAVED_CFLAGS=$CFLAGS
+                         CFLAGS="$CFLAGS $NEON_FLAGS"
+                         AC_CHECK_HEADER([arm_neon.h],
+                           [AS_IF([test "${host_os%%-*}" = "linux" -o \
+                                        "x$enable_neon_rtcd" = "xno"], [
+                             AC_DEFINE(WEBP_HAVE_NEON, [1],
+                                       [Set to 1 if NEON is supported])],
+                                       [AC_MSG_WARN(m4_normalize([NEON runtime
+                                          cpu-detection is unavailable for
+                                          ${host_os%%-*}. Force with
+                                          CFLAGS=-mfpu=neon or
+                                          --disable-neon-rtcd.]))
+                                        enable_neon_rtcd=no
+                                        NEON_FLAGS=""])],
+                           [NEON_FLAGS=""])
+                         CFLAGS=$SAVED_CFLAGS
+                         AS_IF([test -n "$NEON_FLAGS"], [
+                           dnl If NEON is available and rtcd is disabled apply
+                           dnl  NEON_FLAGS globally.
+                           AS_IF([test "x$enable_neon_rtcd" = "xno"], [
+                             AM_CFLAGS="$AM_CFLAGS $NEON_FLAGS"
+                             NEON_FLAGS=""],
+                             [AC_DEFINE(WEBP_HAVE_NEON_RTCD, [1],
+                                        [Set to 1 if runtime detection of NEON
+                                         is enabled])])])])])
+        ;;
+    esac
+    AC_SUBST([NEON_FLAGS])])

 dnl === CLEAR_LIBVARS([var_pfx])
 dnl ===   Clears <var_pfx>_{INCLUDES,LIBS}.
@ -150,7 +276,7 @@ dnl ===   AC_DEFINE'ing <define> if successful.
 AC_DEFUN([CHECK_FOR_BUILTIN],
         [AC_LANG_PUSH([C])
          AC_MSG_CHECKING([for $1])
-          AC_LINK_IFELSE([AC_LANG_PROGRAM([], [$1($2)])],
+          AC_LINK_IFELSE([AC_LANG_PROGRAM([], [(void)$1($2)])],
                         [AC_MSG_RESULT([yes])
                          AC_DEFINE([$3], [1],
                                    [Set to 1 if $1 is available])],
@ -405,11 +531,17 @@ AS_IF([test "x$enable_gif" != "xno"], [
  )
  LIBCHECK_EPILOGUE([GIF])

+  if test "$gif_support" = "yes" -a \
+          "$enable_libwebpdemux" = "yes"; then
+    build_animdiff=yes
+  fi
+
  if test "$gif_support" = "yes" -a \
          "$enable_libwebpmux" = "yes"; then
    build_gif2webp=yes
  fi
 ])
+AM_CONDITIONAL([BUILD_ANIMDIFF], [test "${build_animdiff}" = "yes"])
 AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])

 dnl === check for WIC support ===
@ -527,6 +659,14 @@ AC_ARG_ENABLE([libwebpdecoder],
 AC_MSG_RESULT(${enable_libwebpdecoder-no})
 AM_CONDITIONAL([BUILD_LIBWEBPDECODER], [test "$enable_libwebpdecoder" = "yes"])

+dnl === Check whether libwebpextras should be built
+dnl Adds the --enable-libwebpextras configure switch. When the switch is not
+dnl given, 'enable_libwebpextras' is unset and "no" is reported.
+AC_MSG_CHECKING(whether libwebpextras is to be built)
+AC_ARG_ENABLE([libwebpextras],
+              AS_HELP_STRING([--enable-libwebpextras],
+                             [Build libwebpextras @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpextras-no})
+dnl WANT_EXTRAS is true only when the option value is exactly "yes".
+AM_CONDITIONAL([WANT_EXTRAS], [test "$enable_libwebpextras" = "yes"])
+
 dnl =========================

 AC_CONFIG_MACRO_DIR([m4])
@ -535,9 +675,10 @@ AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
                 examples/Makefile src/dec/Makefile \
                 src/enc/Makefile src/dsp/Makefile \
                 src/demux/Makefile src/mux/Makefile \
-                 src/utils/Makefile \
+                 src/utils/Makefile src/extras/Makefile \
                 src/libwebp.pc src/libwebpdecoder.pc \
-                 src/demux/libwebpdemux.pc src/mux/libwebpmux.pc])
+                 src/demux/libwebpdemux.pc src/mux/libwebpmux.pc \
+                 src/extras/libwebpextras.pc])


 AC_OUTPUT
@ -553,6 +694,7 @@ libwebp: yes
 libwebpdecoder: ${enable_libwebpdecoder-no}
 libwebpdemux: ${enable_libwebpdemux-no}
 libwebpmux: ${enable_libwebpmux-no}
+libwebpextras: ${enable_libwebpextras-no}

 Tools:
 cwebp : yes
@ -568,6 +710,7 @@ dwebp : yes
  PNG  : ${png_support-no}
  WIC  : ${wic_support-no}
 GIF support : ${gif_support-no}
+anim_diff   : ${build_animdiff-no}
 gif2webp    : ${build_gif2webp-no}
 webpmux     : ${enable_libwebpmux-no}
 vwebp       : ${build_vwebp-no}
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -271,9 +271,15 @@ An extended format file consists of:

  * An optional list of [unknown chunks](#unknown-chunks). _\[status: experimental\]_

-For a _still image_, the _image data_ consists of a single frame, whereas for
-an _animated image_, it consists of multiple frames. More details about frames
-can be found in the [Animation](#animation) section.
+For a _still image_, the _image data_ consists of a single frame, which is made
+up of:
+
+  * An optional [alpha subchunk](#alpha).
+
+  * A [bitstream subchunk](#bitstream-vp8vp8l).
+
+For an _animated image_, the _image data_ consists of multiple frames. More
+details about frames can be found in the [Animation](#animation) section.

 All chunks SHOULD be placed in the same order as listed above. If a chunk
 appears in the wrong place, the file is invalid, but readers MAY parse the
@ -809,7 +815,7 @@ RIFF/WEBP
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 [vp8spec]:  http://tools.ietf.org/html/rfc6386
-[webpllspec]: https://gerrit.chromium.org/gerrit/gitweb?p=webm/libwebp.git;a=blob;f=doc/webp-lossless-bitstream-spec.txt;hb=master
+[webpllspec]: https://chromium.googlesource.com/webm/libwebp/+/master/doc/webp-lossless-bitstream-spec.txt
 [iccspec]: http://www.color.org/icc_specs2.xalter
 [metadata]: http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
 [rfc 1166]: http://tools.ietf.org/html/rfc1166
--- a/examples/Android.mk
+++ b/examples/Android.mk
@ -15,6 +15,27 @@ LOCAL_MODULE := example_util

 include $(BUILD_STATIC_LIBRARY)

+
+################################################################################
+# libexample_dec
+#
+# Static library grouping the image-reading sources used by the examples.
+# The cwebp module below lists it in LOCAL_STATIC_LIBRARIES.
+
+include $(CLEAR_VARS)
+
+# Input-format readers plus the metadata helper.
+LOCAL_SRC_FILES := \
+    image_dec.c \
+    jpegdec.c \
+    metadata.c \
+    pngdec.c \
+    tiffdec.c \
+    webpdec.c \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+# The sources include headers relative to the library's src/ directory.
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
+
+LOCAL_MODULE := example_dec
+
+include $(BUILD_STATIC_LIBRARY)
+
 ################################################################################
 # cwebp

@ -24,15 +45,10 @@ include $(CLEAR_VARS)
 # minor modification to their Android.mk files.
 LOCAL_SRC_FILES := \
    cwebp.c \
-    jpegdec.c \
-    metadata.c \
-    pngdec.c \
-    tiffdec.c \
-    webpdec.c \

 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
-LOCAL_STATIC_LIBRARIES := example_util webp
+LOCAL_STATIC_LIBRARIES := example_util example_dec webp

 LOCAL_MODULE := cwebp

--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@ -1,4 +1,4 @@
-AM_CPPFLAGS = -I$(top_builddir)/src -I$(top_srcdir)/src
+AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src

 bin_PROGRAMS = dwebp cwebp
 if BUILD_VWEBP
@ -12,27 +12,41 @@ if BUILD_GIF2WEBP
  bin_PROGRAMS += gif2webp
 endif

-noinst_LTLIBRARIES = libexampleutil.la
+noinst_LTLIBRARIES = libexampleutil.la libexampledec.la

 libexampleutil_la_SOURCES = example_util.c example_util.h stopwatch.h

+libexampledec_la_SOURCES  = image_dec.c image_dec.h
+libexampledec_la_SOURCES += jpegdec.c jpegdec.h
+libexampledec_la_SOURCES += metadata.c metadata.h
+libexampledec_la_SOURCES += pngdec.c pngdec.h
+libexampledec_la_SOURCES += tiffdec.c tiffdec.h
+libexampledec_la_SOURCES += webpdec.c webpdec.h
+libexampledec_la_SOURCES += wicdec.c wicdec.h
+libexampledec_la_CPPFLAGS = $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
+libexampledec_la_CPPFLAGS += $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+
+if BUILD_ANIMDIFF
+  noinst_PROGRAMS = anim_diff
+endif
+
+anim_diff_SOURCES = anim_diff.c anim_util.c anim_util.h
+anim_diff_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
+anim_diff_LDADD  = ../src/demux/libwebpdemux.la
+anim_diff_LDADD += libexampleutil.la
+anim_diff_LDADD += $(GIF_LIBS) -lm
+
 dwebp_SOURCES = dwebp.c stopwatch.h
 dwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 dwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES)
 dwebp_LDADD = libexampleutil.la $(PNG_LIBS) $(JPEG_LIBS)

-cwebp_SOURCES  = cwebp.c metadata.c metadata.h stopwatch.h
-cwebp_SOURCES += jpegdec.c jpegdec.h
-cwebp_SOURCES += pngdec.c pngdec.h
-cwebp_SOURCES += tiffdec.c tiffdec.h
-cwebp_SOURCES += webpdec.c webpdec.h
-cwebp_SOURCES += wicdec.c wicdec.h
+cwebp_SOURCES  = cwebp.c stopwatch.h
 cwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-cwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
-cwebp_LDADD  = libexampleutil.la ../src/libwebp.la
+cwebp_LDADD  = libexampleutil.la libexampledec.la ../src/libwebp.la
 cwebp_LDADD += $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)

-gif2webp_SOURCES = gif2webp.c gif2webp_util.c
+gif2webp_SOURCES = gif2webp.c gifdec.c gifdec.h
 gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
 gif2webp_LDADD  = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
 gif2webp_LDADD += $(GIF_LIBS)
@ -46,9 +60,11 @@ vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GL_INCLUDES)
 vwebp_LDADD = libexampleutil.la ../src/demux/libwebpdemux.la $(GL_LIBS)

 if BUILD_LIBWEBPDECODER
+  anim_diff_LDADD += ../src/libwebpdecoder.la
  dwebp_LDADD += ../src/libwebpdecoder.la
  vwebp_LDADD += ../src/libwebpdecoder.la
 else
+  anim_diff_LDADD += ../src/libwebp.la
  dwebp_LDADD += ../src/libwebp.la
  vwebp_LDADD += ../src/libwebp.la
 endif
--- a/examples/anim_diff.c
+++ b/examples/anim_diff.c
@ -0,0 +1,289 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Checks if given pair of animated GIF/WebP images are identical:
+// That is: their reconstructed canvases match pixel-by-pixel and their other
+// animation properties (loop count etc) also match.
+//
+// example: anim_diff foo.gif bar.webp
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>  // for 'strtod'.
+#include <string.h>  // for 'strcmp'.
+
+#include "./anim_util.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+// Reports whether the signed sum 'a + b' would exceed INT_MAX.
+// Only upward overflow is detected: a non-positive 'b' always yields 0.
+static int AdditionWillOverflow(int a, int b) {
+  if (b <= 0) return 0;
+  return a > INT_MAX - b;
+}
+
+// Returns true if the two RGBA buffers hold byte-identical pixel data.
+// Both buffers are treated as tightly packed, i.e. 'width * 4' bytes per row
+// and 'height' rows. The byte count is computed in size_t so that large
+// canvases cannot overflow 'int' arithmetic.
+static int FramesAreEqual(const uint8_t* const rgba1,
+                          const uint8_t* const rgba2, int width, int height) {
+  // Always true for 'DecodedFrame.rgba'.
+  const size_t stride = (size_t)width * 4;
+  return !memcmp(rgba1, rgba2, stride * (size_t)height);
+}
+
+// Compares two packed 32-bit pixels whose alpha lives in bits 24-31 and whose
+// three color channels live in bits 16-23, 8-15 and 0-7. The pixels are
+// similar when the alpha values differ by at most 'max_allowed_diff' and each
+// color channel, weighted by its own alpha, differs by at most
+// 'max_allowed_diff * 255'.
+static WEBP_INLINE int PixelsAreSimilar(uint32_t src, uint32_t dst,
+                                        int max_allowed_diff) {
+  const int max_weighted_diff = max_allowed_diff * 255;
+  const int src_a = (src >> 24) & 0xff;
+  const int dst_a = (dst >> 24) & 0xff;
+  int shift;
+
+  // Walk the three color channels from the highest to the lowest byte.
+  for (shift = 16; shift >= 0; shift -= 8) {
+    const int src_c = (src >> shift) & 0xff;
+    const int dst_c = (dst >> shift) & 0xff;
+    if (abs(src_c * src_a - dst_c * dst_a) > max_weighted_diff) return 0;
+  }
+  return abs(src_a - dst_a) <= max_allowed_diff;
+}
+
+// Returns true if every pixel of the two tightly packed RGBA frames
+// (4 bytes per pixel, 'width * 4' bytes per row) is similar according to
+// PixelsAreSimilar() with tolerance 'max_allowed_diff', which must be > 0.
+static int FramesAreSimilar(const uint8_t* const rgba1,
+                            const uint8_t* const rgba2,
+                            int width, int height, int max_allowed_diff) {
+  const size_t stride = (size_t)width * 4;
+  int i, j;
+  assert(max_allowed_diff > 0);
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      // A pixel spans 4 consecutive bytes; byte 3 is taken as alpha (matching
+      // the R, G, B, A byte order written by the GIF remapping code in
+      // anim_util.c). Pack all four bytes explicitly with alpha on top:
+      // passing a single byte would leave the alpha field at zero and make
+      // every pair of pixels compare as similar.
+      const size_t offset = (size_t)j * stride + (size_t)i * 4;
+      const uint8_t* const p1 = rgba1 + offset;
+      const uint8_t* const p2 = rgba2 + offset;
+      const uint32_t pix1 = ((uint32_t)p1[3] << 24) | ((uint32_t)p1[0] << 16) |
+                            ((uint32_t)p1[1] << 8) | ((uint32_t)p1[2] << 0);
+      const uint32_t pix2 = ((uint32_t)p2[3] << 24) | ((uint32_t)p2[0] << 16) |
+                            ((uint32_t)p2[1] << 8) | ((uint32_t)p2[2] << 0);
+      if (!PixelsAreSimilar(pix1, pix2, max_allowed_diff)) {
+        return 0;
+      }
+    }
+  }
+  return 1;
+}
+
+// Minimize number of frames by combining successive frames that have at max
+// 'max_diff' difference per channel between corresponding pixels.
+// When 'max_diff' is not positive, only byte-identical frames are merged.
+// A merge adds the later frame's duration to the earlier frame, removes the
+// later frame's entry from 'img->frames' and decrements 'img->num_frames'.
+static void MinimizeAnimationFrames(AnimatedImage* const img, int max_diff) {
+  uint32_t i;
+  for (i = 1; i < img->num_frames; ++i) {
+    // 'frame1' is the frame kept on a merge, 'frame2' the merge candidate.
+    DecodedFrame* const frame1 = &img->frames[i - 1];
+    DecodedFrame* const frame2 = &img->frames[i];
+    const uint8_t* const rgba1 = frame1->rgba;
+    const uint8_t* const rgba2 = frame2->rgba;
+    int should_merge_frames = 0;
+    // If merging frames will result in integer overflow for 'duration',
+    // skip merging.
+    if (AdditionWillOverflow(frame1->duration, frame2->duration)) continue;
+    if (max_diff > 0) {
+      should_merge_frames = FramesAreSimilar(rgba1, rgba2, img->canvas_width,
+                                             img->canvas_height, max_diff);
+    } else {
+      should_merge_frames =
+          FramesAreEqual(rgba1, rgba2, img->canvas_width, img->canvas_height);
+    }
+    if (should_merge_frames) {  // Merge 'i+1'th frame into 'i'th frame.
+      frame1->duration += frame2->duration;
+      // Close the gap by shifting the remaining frame entries down by one.
+      if (i + 1 < img->num_frames) {
+        memmove(&img->frames[i], &img->frames[i + 1],
+                (img->num_frames - i - 1) * sizeof(*img->frames));
+      }
+      --img->num_frames;
+      // Step back so that, after the loop's '++i', the entry that was just
+      // shifted into slot 'i' is compared against the same 'frame1'.
+      --i;
+    }
+  }
+}
+
+// Returns 1 if 'a' equals 'b'. Otherwise prints "<output_str>: a vs b" to
+// stderr and returns 0. The values are unsigned, so they are printed with
+// '%u' to match their type.
+static int CompareValues(uint32_t a, uint32_t b, const char* output_str) {
+  if (a != b) {
+    fprintf(stderr, "%s: %u vs %u\n", output_str, a, b);
+    return 0;
+  }
+  return 1;
+}
+
+// Compares two decoded animations and returns 1 if they match, 0 otherwise.
+// Every mismatch found is reported on stderr.
+//  - Canvas size and frame count must match, otherwise comparison stops.
+//  - For multi-frame images, loop count, background color and per-frame
+//    durations must match as well.
+//  - Each frame pair is passed to GetDiffAndPSNR() together with
+//    'premultiply'. If 'min_psnr' is positive, a frame fails when its PSNR is
+//    below 'min_psnr'; otherwise a frame fails when its maximum pixel
+//    difference is non-zero.
+// Note: As long as frame durations and reconstructed frames are identical, it
+// is OK for other aspects like offsets, dispose/blend method to vary.
+static int CompareAnimatedImagePair(const AnimatedImage* const img1,
+                                    const AnimatedImage* const img2,
+                                    int premultiply,
+                                    double min_psnr) {
+  int ok = 1;
+  const int is_multi_frame_image = (img1->num_frames > 1);
+  uint32_t i;
+
+  ok = CompareValues(img1->canvas_width, img2->canvas_width,
+                     "Canvas width mismatch") && ok;
+  ok = CompareValues(img1->canvas_height, img2->canvas_height,
+                     "Canvas height mismatch") && ok;
+  ok = CompareValues(img1->num_frames, img2->num_frames,
+                     "Frame count mismatch") && ok;
+  if (!ok) return 0;  // These are fatal failures, can't proceed.
+
+  if (is_multi_frame_image) {  // Checks relevant for multi-frame images only.
+    ok = CompareValues(img1->loop_count, img2->loop_count,
+                       "Loop count mismatch") && ok;
+    ok = CompareValues(img1->bgcolor, img2->bgcolor,
+                       "Background color mismatch") && ok;
+  }
+
+  for (i = 0; i < img1->num_frames; ++i) {
+    // Pixel-by-pixel comparison.
+    const uint8_t* const rgba1 = img1->frames[i].rgba;
+    const uint8_t* const rgba2 = img2->frames[i].rgba;
+    int max_diff;
+    double psnr;
+    if (is_multi_frame_image) {  // Check relevant for multi-frame images only.
+      // 'i' is unsigned, hence '%u'. The two-character conversion expands to
+      // at most 10 digits, which the 8 extra bytes accommodate.
+      const char format[] = "Frame #%u, duration mismatch";
+      char tmp[sizeof(format) + 8];
+      ok = ok && (snprintf(tmp, sizeof(tmp), format, i) >= 0);
+      ok = ok && CompareValues(img1->frames[i].duration,
+                               img2->frames[i].duration, tmp);
+    }
+    GetDiffAndPSNR(rgba1, rgba2, img1->canvas_width, img1->canvas_height,
+                   premultiply, &max_diff, &psnr);
+    if (min_psnr > 0.) {
+      if (psnr < min_psnr) {
+        fprintf(stderr, "Frame #%u, psnr = %.2lf (min_psnr = %f)\n", i,
+                psnr, min_psnr);
+        ok = 0;
+      }
+    } else {
+      if (max_diff != 0) {
+        fprintf(stderr, "Frame #%u, max pixel diff: %d\n", i, max_diff);
+        ok = 0;
+      }
+    }
+  }
+  return ok;
+}
+
+// Prints the command-line usage summary to stdout. The '-max_diff' entry is
+// only listed when WEBP_EXPERIMENTAL_FEATURES is defined.
+static void Help(void) {
+  printf("Usage: anim_diff <image1> <image2> [options]\n");
+  printf("\nOptions:\n");
+  printf("  -dump_frames <folder> dump decoded frames in PAM format\n");
+  printf("  -min_psnr <float> ... minimum per-frame PSNR\n");
+  printf("  -raw_comparison ..... if this flag is not used, RGB is\n");
+  printf("                        premultiplied before comparison\n");
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  // Each continuation piece needs its own newline; without it the adjacent
+  // string literals are concatenated into one long, oddly padded line.
+  printf("  -max_diff <int> ..... maximum allowed difference per channel\n"
+         "                        between corresponding pixels in subsequent\n"
+         "                        frames\n");
+#endif
+}
+
+// Entry point: anim_diff <image1> <image2> [options].
+// Return codes:
+//    0  the two files compare as identical,
+//   -1  command-line usage error (help is printed),
+//   -2  one of the files could not be decoded,
+//   -3  the files differ.
+int main(int argc, const char* argv[]) {
+  int return_code = -1;
+  int dump_frames = 0;
+  const char* dump_folder = NULL;
+  double min_psnr = 0.;
+  int got_input1 = 0;
+  int got_input2 = 0;
+  int premultiply = 1;
+  int max_diff = 0;
+  int i, c;
+  const char* files[2] = { NULL, NULL };
+  AnimatedImage images[2];
+
+  if (argc < 3) {
+    Help();
+    return -1;
+  }
+
+  // Parse the command line. Any argument that is not a recognized option is
+  // taken as an input file; a third such argument is an error.
+  for (c = 1; c < argc; ++c) {
+    int parse_error = 0;
+    if (!strcmp(argv[c], "-dump_frames")) {
+      if (c < argc - 1) {
+        dump_frames = 1;
+        dump_folder = argv[++c];
+      } else {
+        parse_error = 1;
+      }
+    } else if (!strcmp(argv[c], "-min_psnr")) {
+      if (c < argc - 1) {
+        const char* const v = argv[++c];
+        char* end = NULL;
+        const double d = strtod(v, &end);
+        // 'end == v' means no characters were consumed by the conversion.
+        if (end == v) {
+          parse_error = 1;
+          fprintf(stderr, "Error! '%s' is not a floating point number.\n", v);
+        }
+        min_psnr = d;
+      } else {
+        parse_error = 1;
+      }
+    } else if (!strcmp(argv[c], "-raw_comparison")) {
+      premultiply = 0;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    } else if (!strcmp(argv[c], "-max_diff")) {
+      if (c < argc - 1) {
+        const char* const v = argv[++c];
+        char* end = NULL;
+        const int n = (int)strtol(v, &end, 10);
+        if (end == v) {
+          parse_error = 1;
+          fprintf(stderr, "Error! '%s' is not an integer.\n", v);
+        }
+        max_diff = n;
+      } else {
+        parse_error = 1;
+      }
+#endif
+    } else {
+      if (!got_input1) {
+        files[0] = argv[c];
+        got_input1 = 1;
+      } else if (!got_input2) {
+        files[1] = argv[c];
+        got_input2 = 1;
+      } else {
+        parse_error = 1;
+      }
+    }
+    if (parse_error) {
+      Help();
+      return -1;
+    }
+  }
+  // Both input files are mandatory.
+  if (!got_input2) {
+    Help();
+    return -1;
+  }
+
+  if (dump_frames) {
+    printf("Dumping decoded frames in: %s\n", dump_folder);
+  }
+
+  // Zero the images first so that the cleanup at 'End' is safe even when
+  // decoding fails before both of them have been filled in.
+  memset(images, 0, sizeof(images));
+  for (i = 0; i < 2; ++i) {
+    printf("Decoding file: %s\n", files[i]);
+    if (!ReadAnimatedImage(files[i], &images[i], dump_frames, dump_folder)) {
+      fprintf(stderr, "Error decoding file: %s\n Aborting.\n", files[i]);
+      return_code = -2;
+      goto End;
+    } else {
+      // Merge equal (or, with a positive 'max_diff', similar) successive
+      // frames before the two animations are compared.
+      MinimizeAnimationFrames(&images[i], max_diff);
+    }
+  }
+
+  if (!CompareAnimatedImagePair(&images[0], &images[1],
+                                premultiply, min_psnr)) {
+    fprintf(stderr, "\nFiles %s and %s differ.\n", files[0], files[1]);
+    return_code = -3;
+  } else {
+    printf("\nFiles %s and %s are identical.\n", files[0], files[1]);
+    return_code = 0;
+  }
+ End:
+  ClearAnimatedImage(&images[0]);
+  ClearAnimatedImage(&images[1]);
+  return return_code;
+}
--- a/examples/anim_util.c
+++ b/examples/anim_util.c
@ -0,0 +1,755 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for animated images
+
+#include "./anim_util.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifdef WEBP_HAVE_GIF
+#include <gif_lib.h>
+#endif
+#include "webp/format_constants.h"
+#include "webp/decode.h"
+#include "webp/demux.h"
+#include "./example_util.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+static const int kNumChannels = 4;
+
+// -----------------------------------------------------------------------------
+// Common utilities.
+
+// Tells whether a frame of size 'width' x 'height' spans the entire canvas,
+// i.e. both of its dimensions equal the canvas dimensions.
+static int IsFullFrame(int width, int height,
+                       int canvas_width, int canvas_height) {
+  if (width != canvas_width) return 0;
+  return height == canvas_height;
+}
+
+// Allocates storage for 'num_frames' decoded frames and attaches it to
+// 'image'. Each frame gets a tightly packed buffer of
+// 'canvas_width * kNumChannels * canvas_height' bytes carved out of a single
+// allocation stored in 'image->raw_mem'; durations and key-frame flags start
+// at 0. Buffers previously attached to 'image' are released on success, so
+// 'image->raw_mem' and 'image->frames' must be NULL or owned by 'image'.
+// Returns 1 on success. Returns 0, leaving 'image' untouched, if a size
+// computation would overflow or an allocation fails.
+static int AllocateFrames(AnimatedImage* const image, uint32_t num_frames) {
+  const size_t kMaxSize = ~(size_t)0;
+  const size_t width = image->canvas_width;
+  const size_t height = image->canvas_height;
+  size_t rgba_size;
+  uint32_t i;
+  uint8_t* mem;
+  DecodedFrame* frames;
+
+  // Validate every product before computing it: a wrapped size would yield
+  // an undersized buffer that the decoders then write past.
+  if (width > kMaxSize / kNumChannels) return 0;
+  if (height != 0 && width * kNumChannels > kMaxSize / height) return 0;
+  rgba_size = width * kNumChannels * height;
+  if (rgba_size != 0 && num_frames > kMaxSize / rgba_size) return 0;
+  if (num_frames > kMaxSize / sizeof(*frames)) return 0;
+
+  mem = (uint8_t*)malloc(num_frames * rgba_size * sizeof(*mem));
+  frames = (DecodedFrame*)malloc(num_frames * sizeof(*frames));
+
+  if (mem == NULL || frames == NULL) {
+    free(mem);
+    free(frames);
+    return 0;
+  }
+  // Release whatever was attached before, so that calling this function
+  // again on the same image does not leak the old buffers.
+  free(image->raw_mem);
+  free(image->frames);
+  image->num_frames = num_frames;
+  image->frames = frames;
+  for (i = 0; i < num_frames; ++i) {
+    frames[i].rgba = mem + i * rgba_size;
+    frames[i].duration = 0;
+    frames[i].is_key_frame = 0;
+  }
+  image->raw_mem = mem;
+  return 1;
+}
+
+// Frees the pixel memory and the frame array owned by 'image' and resets the
+// corresponding fields, so the call can safely be repeated. A NULL 'image' is
+// ignored.
+void ClearAnimatedImage(AnimatedImage* const image) {
+  if (image == NULL) return;
+  free(image->frames);
+  free(image->raw_mem);
+  image->frames = NULL;
+  image->raw_mem = NULL;
+  image->num_frames = 0;
+}
+
+// Sets every byte of a tightly packed canvas to zero, which makes all of its
+// pixels transparent black.
+static void ZeroFillCanvas(uint8_t* rgba,
+                           uint32_t canvas_width, uint32_t canvas_height) {
+  const size_t canvas_bytes = canvas_width * kNumChannels * canvas_height;
+  memset(rgba, 0, canvas_bytes);
+}
+
+// Zeroes (makes transparent) the 'width' x 'height' rectangle whose top-left
+// corner is at pixel ('x_offset', 'y_offset') of a canvas with rows that are
+// 'rgba_stride' bytes apart.
+static void ZeroFillFrameRect(uint8_t* rgba, int rgba_stride, int x_offset,
+                              int y_offset, int width, int height) {
+  uint8_t* row;
+  int rows_left;
+  assert(width * kNumChannels <= rgba_stride);
+  row = rgba + (y_offset * rgba_stride + x_offset * kNumChannels);
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    memset(row, 0, width * kNumChannels);
+    row += rgba_stride;
+  }
+}
+
+// Copies a whole tightly packed canvas of 'width' x 'height' pixels from
+// 'src' to 'dst'. Both pointers must be non-NULL.
+static void CopyCanvas(const uint8_t* src, uint8_t* dst,
+                       uint32_t width, uint32_t height) {
+  size_t canvas_bytes;
+  assert(src != NULL && dst != NULL);
+  canvas_bytes = width * kNumChannels * height;
+  memcpy(dst, src, canvas_bytes);
+}
+
+// Copies the 'width' x 'height' rectangle located at pixel
+// ('x_offset', 'y_offset') from 'src' to the same position in 'dst'. Both
+// buffers use rows that are 'stride' bytes apart.
+static void CopyFrameRectangle(const uint8_t* src, uint8_t* dst, int stride,
+                               int x_offset, int y_offset,
+                               int width, int height) {
+  const int row_bytes = width * kNumChannels;
+  const size_t first_byte = y_offset * stride + x_offset * kNumChannels;
+  const uint8_t* src_row;
+  uint8_t* dst_row;
+  int rows_left;
+  assert(row_bytes <= stride);
+  src_row = src + first_byte;
+  dst_row = dst + first_byte;
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    memcpy(dst_row, src_row, row_bytes);
+    src_row += stride;
+    dst_row += stride;
+  }
+}
+
+// Rewrites every pixel whose top byte (bits 24-31, read as alpha) is zero to
+// the all-zero value, so that fully transparent pixels compare equal no
+// matter what their remaining bytes originally held.
+static void CleanupTransparentPixels(uint32_t* rgba,
+                                     uint32_t width, uint32_t height) {
+  const uint32_t num_pixels = width * height;
+  uint32_t n;
+  for (n = 0; n < num_pixels; ++n) {
+    if ((rgba[n] >> 24) == 0) rgba[n] = 0;
+  }
+}
+
+// Dump frame to a PAM file. Returns true on success.
+// The output path is '<dump_folder>/<base>_frame_<frame_num>.pam', where
+// <base> is the part of 'filename' after its last '/'. The frame is written
+// as 'canvas_height' rows of 'canvas_width * kNumChannels' bytes taken from
+// the tightly packed 'rgba' buffer. Errors are reported on stderr.
+static int DumpFrame(const char filename[], const char dump_folder[],
+                     uint32_t frame_num, const uint8_t rgba[],
+                     int canvas_width, int canvas_height) {
+  int ok = 0;
+  size_t max_len;
+  int len;
+  int y;
+  const char* base_name = NULL;
+  char* file_name = NULL;
+  FILE* f = NULL;
+
+  base_name = strrchr(filename, '/');
+  base_name = (base_name == NULL) ? filename : base_name + 1;
+  // Fixed parts, plus up to 10 digits for a 32-bit frame number, plus the
+  // terminating NUL.
+  max_len = strlen(dump_folder) + 1 + strlen(base_name)
+          + strlen("_frame_") + strlen(".pam") + 10 + 1;
+  file_name = (char*)malloc(max_len * sizeof(*file_name));
+  if (file_name == NULL) goto End;
+
+  // A result >= 'max_len' means the name was truncated: treat it as an error
+  // rather than silently writing to a different file.
+  len = snprintf(file_name, max_len, "%s/%s_frame_%u.pam",
+                 dump_folder, base_name, frame_num);
+  if (len < 0 || (size_t)len >= max_len) {
+    fprintf(stderr, "Error while generating file name\n");
+    goto End;
+  }
+
+  f = fopen(file_name, "wb");
+  if (f == NULL) {
+    fprintf(stderr, "Error opening file for writing: %s\n", file_name);
+    goto End;
+  }
+  if (fprintf(f, "P7\nWIDTH %d\nHEIGHT %d\n"
+              "DEPTH 4\nMAXVAL 255\nTUPLTYPE RGB_ALPHA\nENDHDR\n",
+              canvas_width, canvas_height) < 0) {
+    fprintf(stderr, "Write error for file %s\n", file_name);
+    goto End;
+  }
+  for (y = 0; y < canvas_height; ++y) {
+    // Row offset computed in size_t to avoid 'int' overflow on big canvases.
+    const size_t row_bytes = (size_t)canvas_width * kNumChannels;
+    if (fwrite((const char*)(rgba) + (size_t)y * row_bytes,
+               row_bytes, 1, f) != 1) {
+      fprintf(stderr, "Error writing to file: %s\n", file_name);
+      goto End;
+    }
+  }
+  ok = 1;
+ End:
+  if (f != NULL) fclose(f);
+  free(file_name);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// WebP Decoding.
+
+// Returns true if WebPGetInfo() accepts the bytes in 'webp_data' as a WebP
+// bitstream. The image dimensions are not requested.
+static int IsWebP(const WebPData* const webp_data) {
+  const int info_status =
+      WebPGetInfo(webp_data->bytes, webp_data->size, NULL, NULL);
+  return (info_status != 0);
+}
+
+// Decodes the animated WebP bitstream held in 'webp_data' into 'image'.
+// 'filename' is only used for error messages and for naming dumped frames.
+// 'image' is zeroed first, then filled with the canvas size, loop count,
+// background color and one fully reconstructed canvas per frame; each frame's
+// duration is the difference between consecutive decoder timestamps.
+// If 'dump_frames' is non-zero, every frame is also written to 'dump_folder'.
+// Returns true on success. Returns false if parsing, allocation, decoding or
+// frame dumping fails; frames allocated so far remain attached to 'image'.
+static int ReadAnimatedWebP(const char filename[],
+                            const WebPData* const webp_data,
+                            AnimatedImage* const image, int dump_frames,
+                            const char dump_folder[]) {
+  int ok = 0;
+  int dump_ok = 1;
+  uint32_t frame_index = 0;
+  int prev_frame_timestamp = 0;
+  WebPAnimDecoder* dec;
+  WebPAnimInfo anim_info;
+
+  memset(image, 0, sizeof(*image));
+
+  dec = WebPAnimDecoderNew(webp_data, NULL);
+  if (dec == NULL) {
+    fprintf(stderr, "Error parsing image: %s\n", filename);
+    goto End;
+  }
+
+  if (!WebPAnimDecoderGetInfo(dec, &anim_info)) {
+    fprintf(stderr, "Error getting global info about the animation\n");
+    goto End;
+  }
+
+  // Animation properties.
+  image->canvas_width = anim_info.canvas_width;
+  image->canvas_height = anim_info.canvas_height;
+  image->loop_count = anim_info.loop_count;
+  image->bgcolor = anim_info.bgcolor;
+
+  // Allocate frames. Leave through 'End' on failure so that the decoder is
+  // released instead of leaked.
+  if (!AllocateFrames(image, anim_info.frame_count)) goto End;
+
+  // Decode frames.
+  while (WebPAnimDecoderHasMoreFrames(dec)) {
+    DecodedFrame* curr_frame;
+    uint8_t* curr_rgba;
+    uint8_t* frame_rgba;
+    int timestamp;
+
+    if (!WebPAnimDecoderGetNext(dec, &frame_rgba, &timestamp)) {
+      fprintf(stderr, "Error decoding frame #%u\n", frame_index);
+      goto End;
+    }
+    assert(frame_index < anim_info.frame_count);
+    curr_frame = &image->frames[frame_index];
+    curr_rgba = curr_frame->rgba;
+    curr_frame->duration = timestamp - prev_frame_timestamp;
+    curr_frame->is_key_frame = 0;  // Unused.
+    // Keep a private copy of the canvas the decoder handed back.
+    memcpy(curr_rgba, frame_rgba,
+           image->canvas_width * kNumChannels * image->canvas_height);
+
+    // Needed only because we may want to compare with GIF later.
+    CleanupTransparentPixels((uint32_t*)curr_rgba,
+                             image->canvas_width, image->canvas_height);
+
+    if (dump_frames && dump_ok) {
+      dump_ok = DumpFrame(filename, dump_folder, frame_index, curr_rgba,
+                          image->canvas_width, image->canvas_height);
+      if (!dump_ok) {  // Print error once, but continue decode loop.
+        fprintf(stderr, "Error dumping frames to %s\n", dump_folder);
+      }
+    }
+
+    ++frame_index;
+    prev_frame_timestamp = timestamp;
+  }
+  ok = dump_ok;
+
+ End:
+  WebPAnimDecoderDelete(dec);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// GIF Decoding.
+
+// Returns true if 'data' is longer than a GIF signature and starts with one
+// of the signatures GIF_STAMP, GIF87_STAMP or GIF89_STAMP.
+// NOTE(review): these macros presumably come from <gif_lib.h>, which is only
+// included under WEBP_HAVE_GIF, while this function sits outside that guard;
+// confirm that builds without GIF support still compile.
+static int IsGIF(const WebPData* const data) {
+  if (data->size <= GIF_STAMP_LEN) return 0;
+  if (!memcmp(GIF_STAMP, data->bytes, GIF_STAMP_LEN)) return 1;
+  if (!memcmp(GIF87_STAMP, data->bytes, GIF_STAMP_LEN)) return 1;
+  return !memcmp(GIF89_STAMP, data->bytes, GIF_STAMP_LEN);
+}
+
+#ifdef WEBP_HAVE_GIF
+
+// GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
+// LOCAL_GIF_VERSION packs the library version as (major << 8) | minor, and
+// LOCAL_GIF_PREREQ(maj, min) is true when that version is at least maj.min.
+#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR)
+# define LOCAL_GIF_VERSION ((GIFLIB_MAJOR << 8) | GIFLIB_MINOR)
+# define LOCAL_GIF_PREREQ(maj, min) \
+    (LOCAL_GIF_VERSION >= (((maj) << 8) | (min)))
+#else
+// Version macros unavailable: report version 0 and fail every prerequisite.
+# define LOCAL_GIF_VERSION 0
+# define LOCAL_GIF_PREREQ(maj, min) 0
+#endif
+
+#if !LOCAL_GIF_PREREQ(5, 0)
+
+// Added in v5.0
+// Local definition of the graphics control block, compiled only when the
+// giflib in use is older than 5.0 (see the enclosing LOCAL_GIF_PREREQ check).
+typedef struct {
+  int DisposalMode;
+#define DISPOSAL_UNSPECIFIED      0       // No disposal specified
+#define DISPOSE_DO_NOT            1       // Leave image in place
+#define DISPOSE_BACKGROUND        2       // Set area to background color
+#define DISPOSE_PREVIOUS          3       // Restore to previous content
+  int UserInputFlag;       // User confirmation required before disposal
+  int DelayTime;           // Pre-display delay in 0.01sec units
+  int TransparentColor;    // Palette index for transparency, -1 if none
+#define NO_TRANSPARENT_COLOR     -1
+} GraphicsControlBlock;
+
+// Unpacks a raw 4-byte graphics control extension into 'gcb'.
+// Byte 0 holds the packed flags (bit 0: transparency present, bit 1: user
+// input, bits 2-4: disposal mode), bytes 1-2 the delay (low byte first) and
+// byte 3 the transparent palette index.
+// Returns GIF_ERROR if the extension is not exactly 4 bytes long, GIF_OK
+// otherwise.
+static int DGifExtensionToGCB(const size_t GifExtensionLength,
+                              const GifByteType* GifExtension,
+                              GraphicsControlBlock* gcb) {
+  GifByteType packed;
+  if (GifExtensionLength != 4) return GIF_ERROR;
+
+  packed = GifExtension[0];
+  gcb->DisposalMode = (packed >> 2) & 0x07;
+  gcb->UserInputFlag = ((packed & 0x02) != 0);
+  gcb->DelayTime = GifExtension[1] | (GifExtension[2] << 8);
+  gcb->TransparentColor =
+      (packed & 0x01) ? (int)GifExtension[3] : NO_TRANSPARENT_COLOR;
+  return GIF_OK;
+}
+
+// Fills 'gcb' from the first graphics control extension attached to saved
+// image number 'ImageIndex' of 'GifFile'.
+// Returns GIF_ERROR without touching 'gcb' if 'ImageIndex' is out of range.
+// Otherwise 'gcb' is first reset to defaults (no disposal, no user input,
+// zero delay, no transparent color); the result is then that of
+// DGifExtensionToGCB() for the extension found, or GIF_ERROR if the image
+// has no such extension.
+static int DGifSavedExtensionToGCB(GifFileType* GifFile, int ImageIndex,
+                                   GraphicsControlBlock* gcb) {
+  int n;
+  if (!(ImageIndex >= 0 && ImageIndex <= GifFile->ImageCount - 1)) {
+    return GIF_ERROR;
+  }
+
+  gcb->TransparentColor = NO_TRANSPARENT_COLOR;
+  gcb->DelayTime = 0;
+  gcb->UserInputFlag = 0;
+  gcb->DisposalMode = DISPOSAL_UNSPECIFIED;
+
+  for (n = 0; n < GifFile->SavedImages[ImageIndex].ExtensionBlockCount; ++n) {
+    ExtensionBlock* block =
+        &GifFile->SavedImages[ImageIndex].ExtensionBlocks[n];
+    if (block->Function != GRAPHICS_EXT_FUNC_CODE) continue;
+    return DGifExtensionToGCB(
+        block->ByteCount, (const GifByteType*)block->Bytes, gcb);
+  }
+  return GIF_ERROR;
+}
+
+// Function code used by GetLoopCountGIF() to recognize the block that follows
+// an application extension; defined here for giflib older than 5.0.
+#define CONTINUE_EXT_FUNC_CODE 0x00
+
+// Signature was changed in v5.0
+// Drops the second argument so that two-argument calls compile against the
+// older one-argument function.
+#define DGifOpenFileName(a, b) DGifOpenFileName(a)
+
+#endif  // !LOCAL_GIF_PREREQ(5, 0)
+
+// Signature changed in v5.1
+// Same approach: the second argument is dropped for giflib older than 5.1.
+#if !LOCAL_GIF_PREREQ(5, 1)
+#define DGifCloseFile(a, b) DGifCloseFile(a)
+#endif
+
+// Prints a giflib error to stderr as "GIFLib Error <gif_error>: <text>".
+// How the text is obtained depends on the giflib version:
+//  - >= 5.0: GifErrorString() is given 'gif->Error', or 'gif_error' when
+//            'gif' is NULL;
+//  - 4.2.x:  GifErrorString() takes no argument and 'gif' is unused;
+//  - older:  PrintGifError() emits the text itself and 'gif' is unused.
+static void GIFDisplayError(const GifFileType* const gif, int gif_error) {
+  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
+#if LOCAL_GIF_PREREQ(4, 2)
+#if LOCAL_GIF_PREREQ(5, 0)
+  const char* error_str =
+      GifErrorString((gif == NULL) ? gif_error : gif->Error);
+#else
+  const char* error_str = GifErrorString();
+  (void)gif;
+#endif
+  // Fall back to a fixed text when the library returns no description.
+  if (error_str == NULL) error_str = "Unknown error";
+  fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
+#else
+  (void)gif;
+  fprintf(stderr, "GIFLib Error %d: ", gif_error);
+  PrintGifError();
+  fprintf(stderr, "\n");
+#endif
+}
+
+// Decides whether the current GIF frame is a key frame, based on the previous
+// frame. Returns 1 when there is no previous frame, or when the previous
+// frame was disposed to background and either covered the full canvas or was
+// itself a key frame. Returns 0 in every other case.
+static int IsKeyFrameGIF(const GifImageDesc* prev_desc, int prev_dispose,
+                         const DecodedFrame* const prev_frame,
+                         int canvas_width, int canvas_height) {
+  if (prev_frame == NULL) return 1;
+  if (prev_dispose != DISPOSE_BACKGROUND) return 0;
+  if (IsFullFrame(prev_desc->Width, prev_desc->Height,
+                  canvas_width, canvas_height)) {
+    return 1;
+  }
+  return prev_frame->is_key_frame ? 1 : 0;
+}
+
+// Returns the transparent palette index declared by the first frame's
+// graphics control extension, or NO_TRANSPARENT_COLOR if there is none.
+static int GetTransparentIndexGIF(GifFileType* gif) {
+  GraphicsControlBlock first_gcb;
+  memset(&first_gcb, 0, sizeof(first_gcb));
+  // Start from "no transparency": the pre-5.0 fallback of
+  // DGifSavedExtensionToGCB() in this file returns before touching the block
+  // when the image index is out of range (e.g. a file without images), and
+  // the zero left by memset would then be read as "palette index 0 is
+  // transparent". (Presumably giflib >= 5.0 behaves the same; verify.)
+  first_gcb.TransparentColor = NO_TRANSPARENT_COLOR;
+  DGifSavedExtensionToGCB(gif, 0, &first_gcb);
+  return first_gcb.TransparentColor;
+}
+
+// Returns the GIF background color as a 32-bit value with alpha in bits
+// 24-31, red in bits 16-23, green in bits 8-15 and blue in bits 0-7.
+//  - If the background index equals the first frame's transparent index,
+//    transparent white (0x00ffffff) is returned.
+//  - If there is no usable global color map, or the background index lies
+//    beyond it, opaque white (0xffffffff) is returned.
+//  - Otherwise the palette entry is returned with full opacity.
+static uint32_t GetBackgroundColorGIF(GifFileType* gif) {
+  const int transparent_index = GetTransparentIndexGIF(gif);
+  const ColorMapObject* const color_map = gif->SColorMap;
+  if (transparent_index != NO_TRANSPARENT_COLOR &&
+      gif->SBackGroundColor == transparent_index) {
+    return 0x00ffffff;  // Special case: transparent white.
+  } else if (color_map == NULL || color_map->Colors == NULL
+             || gif->SBackGroundColor >= color_map->ColorCount) {
+    return 0xffffffff;  // Invalid: assume white.
+  } else {
+    const GifColorType color = color_map->Colors[gif->SBackGroundColor];
+    // Shift unsigned values: '0xff << 24' on a signed int would shift into
+    // the sign bit, which is undefined behaviour.
+    return (0xffu << 24) |
+           ((uint32_t)color.Red << 16) |
+           ((uint32_t)color.Green << 8) |
+           ((uint32_t)color.Blue << 0);
+  }
+}
+
+// Scans the extension blocks of every saved image for an application
+// extension carrying the 11-byte signature "NETSCAPE2.0" or "ANIMEXTS1.0".
+// If the block right after it is a continuation block of at least 3 bytes
+// whose first byte is 1, the loop count is read from its bytes 1 and 2 (low
+// byte first) and returned. Returns 0 if no such pair of blocks exists.
+static uint32_t GetLoopCountGIF(const GifFileType* const gif) {
+  int i;
+  for (i = 0; i < gif->ImageCount; ++i) {
+    const SavedImage* const image = &gif->SavedImages[i];
+    int j;
+    for (j = 0; (j + 1) < image->ExtensionBlockCount; ++j) {
+      const ExtensionBlock* const app_ext = &image->ExtensionBlocks[j];
+      const ExtensionBlock* const sub_block = &image->ExtensionBlocks[j + 1];
+      const char* signature;
+
+      if (app_ext->Function != APPLICATION_EXT_FUNC_CODE) continue;
+      if (app_ext->ByteCount != 11) continue;
+      signature = (const char*)app_ext->Bytes;
+      if (memcmp(signature, "NETSCAPE2.0", 11) &&
+          memcmp(signature, "ANIMEXTS1.0", 11)) {
+        continue;
+      }
+      if (sub_block->Function != CONTINUE_EXT_FUNC_CODE) continue;
+      if (sub_block->ByteCount < 3 || sub_block->Bytes[0] != 1) continue;
+
+      return ((uint32_t)(sub_block->Bytes[2]) << 8) +
+             ((uint32_t)(sub_block->Bytes[1]) << 0);
+    }
+  }
+  return 0;  // Default.
+}
+
+// Get duration of 'n'th frame in milliseconds.
+static int GetFrameDurationGIF(GifFileType* gif, int n) {
+  GraphicsControlBlock gcb;
+  memset(&gcb, 0, sizeof(gcb));  // Zeroed first; the lookup result is ignored.
+  DGifSavedExtensionToGCB(gif, n, &gcb);
+  return gcb.DelayTime * 10;  // Presumably DelayTime is in 1/100s -- confirm.
+}
+
+// Returns true if the rectangle of 'target' fully contains that of 'covered'.
+static int CoversFrameGIF(const GifImageDesc* const target,
+                          const GifImageDesc* const covered) {
+  return target->Left <= covered->Left &&  // Left edge.
+         covered->Left + covered->Width <= target->Left + target->Width &&
+         target->Top <= covered->Top &&  // Top edge.
+         covered->Top + covered->Height <= target->Top + target->Height;
+}
+
+static void RemapPixelsGIF(const uint8_t* const src,  // 'len' color indices.
+                           const ColorMapObject* const cmap,
+                           int transparent_color, int len, uint8_t* dst) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    if (src[i] != transparent_color) {
+      // If a pixel in the current frame is transparent, we don't modify it, so
+      // that we can see-through the corresponding pixel from an earlier frame.
+      const GifColorType c = cmap->Colors[src[i]];  // Index not checked.
+      dst[4 * i + 0] = c.Red;
+      dst[4 * i + 1] = c.Green;
+      dst[4 * i + 2] = c.Blue;
+      dst[4 * i + 3] = 0xff;  // Written pixels get full alpha.
+    }
+  }
+}
+
+static int ReadFrameGIF(const SavedImage* const gif_image,
+                        const ColorMapObject* cmap, int transparent_color,
+                        int out_stride, uint8_t* const dst) {
+  const GifImageDesc* image_desc = &gif_image->ImageDesc;
+  const uint8_t* in;
+  uint8_t* out;  // NOTE(review): frame rect is not clipped to 'dst' here.
+  int j;
+
+  if (image_desc->ColorMap) cmap = image_desc->ColorMap;  // Local map wins.
+
+  if (cmap == NULL || cmap->ColorCount != (1 << cmap->BitsPerPixel)) {
+    fprintf(stderr, "Potentially corrupt color map.\n");
+    return 0;
+  }
+
+  in = (const uint8_t*)gif_image->RasterBits;
+  out = dst + image_desc->Top * out_stride + image_desc->Left * kNumChannels;
+
+  for (j = 0; j < image_desc->Height; ++j) {  // One frame row per iteration.
+    RemapPixelsGIF(in, cmap, transparent_color, image_desc->Width, out);
+    in += image_desc->Width;
+    out += out_stride;
+  }
+  return 1;
+}
+
+// Read animated GIF bitstream from 'filename' into 'AnimatedImage' struct.
+static int ReadAnimatedGIF(const char filename[], AnimatedImage* const image,
+                           int dump_frames, const char dump_folder[]) {
+  uint32_t frame_count;
+  uint32_t canvas_width, canvas_height;
+  uint32_t i;
+  int gif_error;
+  GifFileType* gif;
+
+  gif = DGifOpenFileName(filename, NULL);
+  if (gif == NULL) {
+    fprintf(stderr, "Could not read file: %s.\n", filename);
+    return 0;
+  }
+
+  gif_error = DGifSlurp(gif);
+  if (gif_error != GIF_OK) {
+    fprintf(stderr, "Could not parse image: %s.\n", filename);
+    GIFDisplayError(gif, gif_error);
+    DGifCloseFile(gif, NULL);
+    return 0;
+  }
+
+  // Animation properties.
+  image->canvas_width = (uint32_t)gif->SWidth;
+  image->canvas_height = (uint32_t)gif->SHeight;
+  if (image->canvas_width > MAX_CANVAS_SIZE ||
+      image->canvas_height > MAX_CANVAS_SIZE) {
+    fprintf(stderr, "Invalid canvas dimension: %d x %d\n",
+            image->canvas_width, image->canvas_height);
+    DGifCloseFile(gif, NULL);
+    return 0;
+  }
+  image->loop_count = GetLoopCountGIF(gif);
+  image->bgcolor = GetBackgroundColorGIF(gif);
+
+  frame_count = (uint32_t)gif->ImageCount;
+  if (frame_count == 0) {
+    DGifCloseFile(gif, NULL);
+    return 0;
+  }
+
+  if (image->canvas_width == 0 || image->canvas_height == 0) {
+    image->canvas_width = gif->SavedImages[0].ImageDesc.Width;
+    image->canvas_height = gif->SavedImages[0].ImageDesc.Height;
+    gif->SavedImages[0].ImageDesc.Left = 0;
+    gif->SavedImages[0].ImageDesc.Top = 0;
+    if (image->canvas_width == 0 || image->canvas_height == 0) {
+      fprintf(stderr, "Invalid canvas size in GIF.\n");
+      DGifCloseFile(gif, NULL);
+      return 0;
+    }
+  }
+  // Allocate frames.
+  AllocateFrames(image, frame_count);
+
+  canvas_width = image->canvas_width;
+  canvas_height = image->canvas_height;
+
+  // Decode and reconstruct frames.
+  for (i = 0; i < frame_count; ++i) {
+    const int canvas_width_in_bytes = canvas_width * kNumChannels;
+    const SavedImage* const curr_gif_image = &gif->SavedImages[i];
+    GraphicsControlBlock curr_gcb;
+    DecodedFrame* curr_frame;
+    uint8_t* curr_rgba;
+
+    memset(&curr_gcb, 0, sizeof(curr_gcb));
+    DGifSavedExtensionToGCB(gif, i, &curr_gcb);
+
+    curr_frame = &image->frames[i];
+    curr_rgba = curr_frame->rgba;
+    curr_frame->duration = GetFrameDurationGIF(gif, i);
+
+    if (i == 0) {  // Initialize as transparent.
+      curr_frame->is_key_frame = 1;
+      ZeroFillCanvas(curr_rgba, canvas_width, canvas_height);
+    } else {
+      DecodedFrame* const prev_frame = &image->frames[i - 1];
+      const GifImageDesc* const prev_desc = &gif->SavedImages[i - 1].ImageDesc;
+      GraphicsControlBlock prev_gcb;
+      memset(&prev_gcb, 0, sizeof(prev_gcb));
+      DGifSavedExtensionToGCB(gif, i - 1, &prev_gcb);
+
+      curr_frame->is_key_frame =
+          IsKeyFrameGIF(prev_desc, prev_gcb.DisposalMode, prev_frame,
+                        canvas_width, canvas_height);
+
+      if (curr_frame->is_key_frame) {  // Initialize as transparent.
+        ZeroFillCanvas(curr_rgba, canvas_width, canvas_height);
+      } else {
+        int prev_frame_disposed, curr_frame_opaque;
+        int prev_frame_completely_covered;
+        // Initialize with previous canvas.
+        uint8_t* const prev_rgba = image->frames[i - 1].rgba;
+        CopyCanvas(prev_rgba, curr_rgba, canvas_width, canvas_height);
+
+        // Dispose previous frame rectangle.
+        prev_frame_disposed =
+            (prev_gcb.DisposalMode == DISPOSE_BACKGROUND ||
+             prev_gcb.DisposalMode == DISPOSE_PREVIOUS);
+        curr_frame_opaque =
+            (curr_gcb.TransparentColor == NO_TRANSPARENT_COLOR);
+        prev_frame_completely_covered =
+            curr_frame_opaque &&
+            CoversFrameGIF(&curr_gif_image->ImageDesc, prev_desc);
+
+        if (prev_frame_disposed && !prev_frame_completely_covered) {
+          switch (prev_gcb.DisposalMode) {
+            case DISPOSE_BACKGROUND: {
+              ZeroFillFrameRect(curr_rgba, canvas_width_in_bytes,
+                                prev_desc->Left, prev_desc->Top,
+                                prev_desc->Width, prev_desc->Height);
+              break;
+            }
+            case DISPOSE_PREVIOUS: {
+              int src_frame_num = i - 2;
+              while (src_frame_num >= 0) {
+                GraphicsControlBlock src_frame_gcb;
+                memset(&src_frame_gcb, 0, sizeof(src_frame_gcb));
+                DGifSavedExtensionToGCB(gif, src_frame_num, &src_frame_gcb);
+                if (src_frame_gcb.DisposalMode != DISPOSE_PREVIOUS) break;
+                --src_frame_num;
+              }
+              if (src_frame_num >= 0) {
+                // Restore pixels inside previous frame rectangle to
+                // corresponding pixels in source canvas.
+                uint8_t* const src_frame_rgba =
+                    image->frames[src_frame_num].rgba;
+                CopyFrameRectangle(src_frame_rgba, curr_rgba,
+                                   canvas_width_in_bytes,
+                                   prev_desc->Left, prev_desc->Top,
+                                   prev_desc->Width, prev_desc->Height);
+              } else {
+                // Source canvas doesn't exist. So clear previous frame
+                // rectangle to background.
+                ZeroFillFrameRect(curr_rgba, canvas_width_in_bytes,
+                                  prev_desc->Left, prev_desc->Top,
+                                  prev_desc->Width, prev_desc->Height);
+              }
+              break;
+            }
+            default:
+              break;  // Nothing to do.
+          }
+        }
+      }
+    }
+
+    // Decode current frame.
+    if (!ReadFrameGIF(curr_gif_image, gif->SColorMap, curr_gcb.TransparentColor,
+                      canvas_width_in_bytes, curr_rgba)) {
+      DGifCloseFile(gif, NULL);
+      return 0;
+    }
+
+    if (dump_frames) {
+      if (!DumpFrame(filename, dump_folder, i, curr_rgba,
+                     canvas_width, canvas_height)) {
+        DGifCloseFile(gif, NULL);
+        return 0;
+      }
+    }
+  }
+  DGifCloseFile(gif, NULL);
+  return 1;
+}
+
+#else
+
+static int ReadAnimatedGIF(const char filename[], AnimatedImage* const image,
+                           int dump_frames, const char dump_folder[]) {
+  (void)filename;  // Arguments are unused in this fallback.
+  (void)image;
+  (void)dump_frames;
+  (void)dump_folder;
+  fprintf(stderr, "GIF support not compiled. Please install the libgif-dev "
+          "package before building.\n");
+  return 0;  // Always reports failure.
+}
+
+#endif  // WEBP_HAVE_GIF
+
+// -----------------------------------------------------------------------------
+
+int ReadAnimatedImage(const char filename[], AnimatedImage* const image,
+                      int dump_frames, const char dump_folder[]) {
+  int ok = 0;
+  WebPData webp_data;
+
+  WebPDataInit(&webp_data);
+  memset(image, 0, sizeof(*image));  // Previous content is discarded.
+
+  if (!ExUtilReadFile(filename, &webp_data.bytes, &webp_data.size)) {
+    fprintf(stderr, "Error reading file: %s\n", filename);
+    return 0;
+  }
+
+  if (IsWebP(&webp_data)) {  // Pick the reader based on the loaded data.
+    ok = ReadAnimatedWebP(filename, &webp_data, image, dump_frames,
+                          dump_folder);
+  } else if (IsGIF(&webp_data)) {  // The GIF reader re-opens 'filename'.
+    ok = ReadAnimatedGIF(filename, image, dump_frames, dump_folder);
+  } else {
+    fprintf(stderr,
+            "Unknown file type: %s. Supported file types are WebP and GIF\n",
+            filename);
+    ok = 0;
+  }
+  if (!ok) ClearAnimatedImage(image);  // Don't hand back a partial result.
+  WebPDataClear(&webp_data);
+  return ok;
+}
+
+static void Accumulate(double v1, double v2, double* const max_diff,
+                       double* const sse) {
+  const double diff = fabs(v1 - v2);
+  if (diff > *max_diff) *max_diff = diff;  // Track the largest difference.
+  *sse += diff * diff;  // Running sum of squared differences.
+}
+
+void GetDiffAndPSNR(const uint8_t rgba1[], const uint8_t rgba2[],
+                    uint32_t width, uint32_t height, int premultiply,
+                    int* const max_diff, double* const psnr) {
+  const uint32_t stride = width * kNumChannels;  // Bytes per row.
+  const int kAlphaChannel = kNumChannels - 1;  // Alpha is the last channel.
+  double f_max_diff = 0.;
+  double sse = 0.;
+  uint32_t x, y;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < stride; x += kNumChannels) {
+      int k;
+      const size_t offset = (size_t)y * stride + x;  // No 32-bit wraparound.
+      const int alpha1 = rgba1[offset + kAlphaChannel];
+      const int alpha2 = rgba2[offset + kAlphaChannel];
+      Accumulate(alpha1, alpha2, &f_max_diff, &sse);
+      if (!premultiply) {
+        for (k = 0; k < kAlphaChannel; ++k) {
+          Accumulate(rgba1[offset + k], rgba2[offset + k], &f_max_diff, &sse);
+        }
+      } else {
+        // premultiply R/G/B channels with alpha value
+        for (k = 0; k < kAlphaChannel; ++k) {
+          Accumulate(rgba1[offset + k] * alpha1 / 255.,
+                     rgba2[offset + k] * alpha2 / 255.,
+                     &f_max_diff, &sse);
+        }
+      }
+    }
+  }
+  *max_diff = (int)f_max_diff;  // Truncated to an integer.
+  if (*max_diff == 0) {
+    *psnr = 99.;  // PSNR when images are identical.
+  } else {
+    sse /= (double)stride * height;  // Mean squared error, computed in double.
+    *psnr = 4.3429448 * log(255. * 255. / sse);  // 10 * log10(255^2 / MSE).
+  }
+}
--- a/examples/anim_util.h
+++ b/examples/anim_util.h
@ -0,0 +1,63 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for animated images
+
+#ifndef WEBP_EXAMPLES_ANIM_UTIL_H_
+#define WEBP_EXAMPLES_ANIM_UTIL_H_
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#include "webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  uint8_t* rgba;         // Decoded and reconstructed full-canvas frame.
+  int duration;          // Frame duration in milliseconds.
+  int is_key_frame;      // True if this frame is a key-frame.
+} DecodedFrame;
+
+typedef struct {
+  uint32_t canvas_width;   // Canvas dimensions, in pixels.
+  uint32_t canvas_height;
+  uint32_t bgcolor;        // Background color (0xAARRGGBB in the GIF reader).
+  uint32_t loop_count;     // Animation loop count.
+  DecodedFrame* frames;    // Array of decoded frames, indexed by frame number.
+  uint32_t num_frames;     // Presumably the length of 'frames' -- confirm.
+  void* raw_mem;           // NOTE(review): purpose not visible here -- confirm.
+} AnimatedImage;
+
+// Deallocate everything in 'image' (but not the object itself).
+void ClearAnimatedImage(AnimatedImage* const image);
+
+// Read animated image file into 'AnimatedImage' struct.
+// If 'dump_frames' is true, dump frames to 'dump_folder'.
+// Previous content of 'image' is obliterated.
+// Upon successful return, content of 'image' must be deleted by
+// calling 'ClearAnimatedImage'.
+int ReadAnimatedImage(const char filename[], AnimatedImage* const image,
+                      int dump_frames, const char dump_folder[]);
+
+// Given two RGBA buffers, calculate max pixel difference and PSNR.
+// If 'premultiply' is true, R/G/B values will be pre-multiplied by the
+// transparency before comparison.
+void GetDiffAndPSNR(const uint8_t rgba1[], const uint8_t rgba2[],
+                    uint32_t width, uint32_t height, int premultiply,
+                    int* const max_diff, double* const psnr);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_ANIM_UTIL_H_
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@ -20,17 +20,10 @@
 #include "webp/config.h"
 #endif

-#include "webp/encode.h"
-
 #include "./example_util.h"
-#include "./metadata.h"
+#include "./image_dec.h"
 #include "./stopwatch.h"
-
-#include "./jpegdec.h"
-#include "./pngdec.h"
-#include "./tiffdec.h"
-#include "./webpdec.h"
-#include "./wicdec.h"
+#include "webp/encode.h"

 #ifndef WEBP_DLL
 #ifdef __cplusplus
@ -48,127 +41,81 @@ extern void* VP8GetCPUInfo;   // opaque forward declaration.

 static int verbose = 0;

-static int ReadYUV(FILE* in_file, WebPPicture* const pic) {
+static int ReadYUV(const uint8_t* const data, size_t data_size,
+                   WebPPicture* const pic) {
  const int use_argb = pic->use_argb;
  const int uv_width = (pic->width + 1) / 2;
  const int uv_height = (pic->height + 1) / 2;
-  int y;
-  int ok = 0;
+  const int y_plane_size = pic->width * pic->height;
+  const int uv_plane_size = uv_width * uv_height;
+  const size_t expected_data_size = y_plane_size + 2 * uv_plane_size;
+
+  if (data_size != expected_data_size) {
+    fprintf(stderr,
+            "input data doesn't have the expected size (%d instead of %d)\n",
+            (int)data_size, (int)expected_data_size);
+    return 0;
+  }

  pic->use_argb = 0;
-  if (!WebPPictureAlloc(pic)) return ok;
-
-  for (y = 0; y < pic->height; ++y) {
-    if (fread(pic->y + y * pic->y_stride, pic->width, 1, in_file) != 1) {
-      goto End;
-    }
-  }
-  for (y = 0; y < uv_height; ++y) {
-    if (fread(pic->u + y * pic->uv_stride, uv_width, 1, in_file) != 1)
-      goto End;
-  }
-  for (y = 0; y < uv_height; ++y) {
-    if (fread(pic->v + y * pic->uv_stride, uv_width, 1, in_file) != 1)
-      goto End;
-  }
-  ok = 1;
-  if (use_argb) ok = WebPPictureYUVAToARGB(pic);
-
- End:
-  return ok;
+  if (!WebPPictureAlloc(pic)) return 0;
+  ExUtilCopyPlane(data, pic->width, pic->y, pic->y_stride,
+                  pic->width, pic->height);
+  ExUtilCopyPlane(data + y_plane_size, uv_width,
+                  pic->u, pic->uv_stride, uv_width, uv_height);
+  ExUtilCopyPlane(data + y_plane_size + uv_plane_size, uv_width,
+                  pic->v, pic->uv_stride, uv_width, uv_height);
+  return use_argb ? WebPPictureYUVAToARGB(pic) : 1;
 }

 #ifdef HAVE_WINCODEC_H

 static int ReadPicture(const char* const filename, WebPPicture* const pic,
                       int keep_alpha, Metadata* const metadata) {
-  int ok;
+  int ok = 0;
+  const uint8_t* data = NULL;
+  size_t data_size = 0;
  if (pic->width != 0 && pic->height != 0) {
-    // If image size is specified, infer it as YUV format.
-    FILE* in_file = fopen(filename, "rb");
-    if (in_file == NULL) {
-      fprintf(stderr, "Error! Cannot open input file '%s'\n", filename);
-      return 0;
-    }
-    ok = ReadYUV(in_file, pic);
-    fclose(in_file);
+    ok = ExUtilReadFile(filename, &data, &data_size);
+    ok = ok && ReadYUV(data, data_size, pic);
  } else {
    // If no size specified, try to decode it using WIC.
    ok = ReadPictureWithWIC(filename, pic, keep_alpha, metadata);
    if (!ok) {
-      ok = ReadWebP(filename, pic, keep_alpha, metadata);
+      ok = ExUtilReadFile(filename, &data, &data_size);
+      ok = ok && ReadWebP(data, data_size, pic, keep_alpha, metadata);
    }
  }
  if (!ok) {
    fprintf(stderr, "Error! Could not process file %s\n", filename);
  }
+  free((void*)data);
  return ok;
 }

 #else  // !HAVE_WINCODEC_H

-typedef enum {
-  PNG_ = 0,
-  JPEG_,
-  TIFF_,  // 'TIFF' clashes with libtiff
-  WEBP_,
-  UNSUPPORTED
-} InputFileFormat;
-
-static InputFileFormat GetImageType(FILE* in_file) {
-  InputFileFormat format = UNSUPPORTED;
-  uint32_t magic1, magic2;
-  uint8_t buf[12];
-
-  if ((fread(&buf[0], 12, 1, in_file) != 1) ||
-      (fseek(in_file, 0, SEEK_SET) != 0)) {
-    return format;
-  }
-
-  magic1 = ((uint32_t)buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
-  magic2 = ((uint32_t)buf[8] << 24) | (buf[9] << 16) | (buf[10] << 8) | buf[11];
-  if (magic1 == 0x89504E47U) {
-    format = PNG_;
-  } else if (magic1 >= 0xFFD8FF00U && magic1 <= 0xFFD8FFFFU) {
-    format = JPEG_;
-  } else if (magic1 == 0x49492A00 || magic1 == 0x4D4D002A) {
-    format = TIFF_;
-  } else if (magic1 == 0x52494646 && magic2 == 0x57454250) {
-    format = WEBP_;
-  }
-  return format;
-}
-
 static int ReadPicture(const char* const filename, WebPPicture* const pic,
                       int keep_alpha, Metadata* const metadata) {
+  const uint8_t* data = NULL;
+  size_t data_size = 0;
  int ok = 0;
-  FILE* in_file = fopen(filename, "rb");
-  if (in_file == NULL) {
-    fprintf(stderr, "Error! Cannot open input file '%s'\n", filename);
-    return ok;
-  }
+
+  ok = ExUtilReadFile(filename, &data, &data_size);
+  if (!ok) goto End;

  if (pic->width == 0 || pic->height == 0) {
-    // If no size specified, try to decode it as PNG/JPEG (as appropriate).
-    const InputFileFormat format = GetImageType(in_file);
-    if (format == PNG_) {
-      ok = ReadPNG(in_file, pic, keep_alpha, metadata);
-    } else if (format == JPEG_) {
-      ok = ReadJPEG(in_file, pic, metadata);
-    } else if (format == TIFF_) {
-      ok = ReadTIFF(filename, pic, keep_alpha, metadata);
-    } else if (format == WEBP_) {
-      ok = ReadWebP(filename, pic, keep_alpha, metadata);
-    }
+    WebPImageReader reader = WebPGuessImageReader(data, data_size);
+    ok = (reader != NULL) && reader(data, data_size, pic, keep_alpha, metadata);
  } else {
    // If image size is specified, infer it as YUV format.
-    ok = ReadYUV(in_file, pic);
+    ok = ReadYUV(data, data_size, pic);
  }
+ End:
  if (!ok) {
    fprintf(stderr, "Error! Could not process file %s\n", filename);
  }
-
-  fclose(in_file);
+  free((void*)data);
  return ok;
 }

@ -212,6 +159,8 @@ static void PrintFullLosslessInfo(const WebPAuxStats* const stats,
                                  const char* const description) {
  fprintf(stderr, "Lossless-%s compressed size: %d bytes\n",
          description, stats->lossless_size);
+  fprintf(stderr, "  * Header size: %d bytes, image data size: %d\n",
+          stats->lossless_hdr_size, stats->lossless_data_size);
  if (stats->lossless_features) {
    fprintf(stderr, "  * Lossless features used:");
    if (stats->lossless_features & 1) fprintf(stderr, " PREDICTION");
@ -565,8 +514,8 @@ static void HelpLong(void) {
  printf("Windows builds can take as input any of the files handled by WIC.\n");
 #endif
  printf("\nOptions:\n");
-  printf("  -h / -help  ............ short help\n");
-  printf("  -H / -longhelp  ........ long help\n");
+  printf("  -h / -help ............. short help\n");
+  printf("  -H / -longhelp ......... long help\n");
  printf("  -q <float> ............. quality factor (0:small..100:big)\n");
  printf("  -alpha_q <int> ......... transparency-compression quality "
         "(0..100)\n");
@ -574,10 +523,8 @@ static void HelpLong(void) {
  printf("                            default, photo, picture,\n");
  printf("                            drawing, icon, text\n");
  printf("     -preset must come first, as it overwrites other parameters\n");
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
  printf("  -z <int> ............... activates lossless preset with given\n"
         "                           level in [0:fast, ..., 9:slowest]\n");
-#endif
  printf("\n");
  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
  printf("  -segments <int> ........ number of segments to use (1..4)\n");
@ -608,13 +555,19 @@ static void HelpLong(void) {
  printf("  -alpha_method <int> .... transparency-compression method (0..1)\n");
  printf("  -alpha_filter <string> . predictive filtering for alpha plane,\n");
  printf("                           one of: none, fast (default) or best\n");
-  printf("  -alpha_cleanup ......... clean RGB values in transparent area\n");
+  printf("  -exact ................. preserve RGB values in transparent area"
+         "\n");
  printf("  -blend_alpha <hex> ..... blend colors against background color\n"
         "                           expressed as RGB values written in\n"
         "                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0\n"
         "                           green=0xe0 and blue=0xd0\n");
  printf("  -noalpha ............... discard any transparency information\n");
  printf("  -lossless .............. encode image losslessly\n");
+  printf("  -near_lossless <int> ... use near-lossless image\n"
+         "                           preprocessing (0..100=off)\n");
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  printf("  -delta_palettization ... use delta palettization\n");
+#endif  // WEBP_EXPERIMENTAL_FEATURES
  printf("  -hint <string> ......... specify image characteristics hint,\n");
  printf("                           one of: photo, picture or graph\n");

@ -679,10 +632,8 @@ int main(int argc, const char *argv[]) {
  uint32_t background_color = 0xffffffu;
  int crop = 0, crop_x = 0, crop_y = 0, crop_w = 0, crop_h = 0;
  int resize_w = 0, resize_h = 0;
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
  int lossless_preset = 6;
  int use_lossless_preset = -1;  // -1=unset, 0=don't use, 1=use it
-#endif
  int show_progress = 0;
  int keep_metadata = 0;
  int metadata_written = 0;
@ -736,25 +687,31 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-s") && c < argc - 2) {
      picture.width = ExUtilGetInt(argv[++c], 0, &parse_error);
      picture.height = ExUtilGetInt(argv[++c], 0, &parse_error);
+      if (picture.width > WEBP_MAX_DIMENSION || picture.width < 0 ||
+          picture.height > WEBP_MAX_DIMENSION ||  picture.height < 0) {
+        fprintf(stderr,
+                "Specified dimension (%d x %d) is out of range.\n",
+                picture.width, picture.height);
+        goto Error;
+      }
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
      config.method = ExUtilGetInt(argv[++c], 0, &parse_error);
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
      use_lossless_preset = 0;   // disable -z option
-#endif
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
      use_lossless_preset = 0;   // disable -z option
    } else if (!strcmp(argv[c], "-z") && c < argc - 1) {
      lossless_preset = ExUtilGetInt(argv[++c], 0, &parse_error);
      if (use_lossless_preset != 0) use_lossless_preset = 1;
-#endif
    } else if (!strcmp(argv[c], "-alpha_q") && c < argc - 1) {
      config.alpha_quality = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-alpha_method") && c < argc - 1) {
      config.alpha_compression = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-alpha_cleanup")) {
-      keep_alpha = keep_alpha ? 2 : 0;
+      // This flag is obsolete, does opposite of -exact.
+      config.exact = 0;
+    } else if (!strcmp(argv[c], "-exact")) {
+      config.exact = 1;
    } else if (!strcmp(argv[c], "-blend_alpha") && c < argc - 1) {
      blend_alpha = 1;
      // background color is given in hex with an optional '0x' prefix
@ -776,6 +733,14 @@ int main(int argc, const char *argv[]) {
      keep_alpha = 0;
    } else if (!strcmp(argv[c], "-lossless")) {
      config.lossless = 1;
+    } else if (!strcmp(argv[c], "-near_lossless") && c < argc - 1) {
+      config.near_lossless = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.lossless = 1;  // use near-lossless only with lossless
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    } else if (!strcmp(argv[c], "-delta_palettization")) {
+      config.delta_palettization = 1;
+      config.lossless = 1;  // use delta-palettization only with lossless
+#endif  // WEBP_EXPERIMENTAL_FEATURES
    } else if (!strcmp(argv[c], "-hint") && c < argc - 1) {
      ++c;
      if (!strcmp(argv[c], "photo")) {
@ -836,7 +801,7 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetEncoderVersion();
      printf("%d.%d.%d\n",
-        (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
+             (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
      return 0;
    } else if (!strcmp(argv[c], "-progress")) {
      show_progress = 1;
@ -935,14 +900,12 @@ int main(int argc, const char *argv[]) {
    goto Error;
  }

-#if WEBP_ENCODER_ABI_VERSION > 0x0202
  if (use_lossless_preset == 1) {
    if (!WebPConfigLosslessPreset(&config, lossless_preset)) {
      fprintf(stderr, "Invalid lossless preset (-z %d)\n", lossless_preset);
      goto Error;
    }
  }
-#endif

  // Check for unsupported command line options for lossless mode and log
  // warning for such options.
@ -956,13 +919,22 @@ int main(int argc, const char *argv[]) {
                      " encoding. Ignoring this option!\n");
    }
  }
+  // If a target size or PSNR was given, but somehow the -pass option was
+  // omitted, force a reasonable value.
+  if (config.target_size > 0 || config.target_PSNR > 0) {
+    if (config.pass == 1) config.pass = 6;
+  }

  if (!WebPValidateConfig(&config)) {
    fprintf(stderr, "Error! Invalid configuration.\n");
    goto Error;
  }

-  // Read the input
+  // Read the input. We need to decide if we prefer ARGB or YUVA
+  // samples, depending on the expected compression mode (this saves
+  // some conversion steps).
+  picture.use_argb = (config.lossless || config.preprocessing > 0 ||
+                      crop || (resize_w | resize_h) > 0);
  if (verbose) {
    StopwatchReset(&stop_watch);
  }
@ -977,10 +949,6 @@ int main(int argc, const char *argv[]) {
    WebPBlendAlpha(&picture, background_color);
  }

-  if (keep_alpha == 2) {
-    WebPCleanupTransparentArea(&picture);
-  }
-
  if (verbose) {
    const double read_time = StopwatchReadAndReset(&stop_watch);
    fprintf(stderr, "Time to read input: %.3fs\n", read_time);
@ -1017,7 +985,7 @@ int main(int argc, const char *argv[]) {
    picture.user_data = (void*)in_file;
  }

-  // Compress
+  // Crop & resize.
  if (verbose) {
    StopwatchReset(&stop_watch);
  }
@ -1034,12 +1002,22 @@ int main(int argc, const char *argv[]) {
      goto Error;
    }
  }
+  if (verbose && (crop != 0 || (resize_w | resize_h) > 0)) {
+    const double preproc_time = StopwatchReadAndReset(&stop_watch);
+    fprintf(stderr, "Time to crop/resize picture: %.3fs\n", preproc_time);
+  }
+
  if (picture.extra_info_type > 0) {
    AllocExtraInfo(&picture);
  }
  if (print_distortion >= 0) {  // Save original picture for later comparison
    WebPPictureCopy(&picture, &original_picture);
  }
+
+  // Compress.
+  if (verbose) {
+    StopwatchReset(&stop_watch);
+  }
  if (!WebPEncode(&config, &picture)) {
    fprintf(stderr, "Error! Cannot encode picture as WebP\n");
    fprintf(stderr, "Error code: %d (%s)\n",
@ -1103,16 +1081,19 @@ int main(int argc, const char *argv[]) {
    if (print_distortion >= 0) {    // print distortion
      static const char* distortion_names[] = { "PSNR", "SSIM", "LSIM" };
      float values[5];
-      // Comparison is performed in YUVA colorspace.
-      if (original_picture.use_argb &&
-          !WebPPictureARGBToYUVA(&original_picture, WEBP_YUV420A)) {
-       fprintf(stderr, "Error while converting original picture to YUVA.\n");
-        goto Error;
-      }
-      if (picture.use_argb &&
-          !WebPPictureARGBToYUVA(&picture, WEBP_YUV420A)) {
-        fprintf(stderr, "Error while converting compressed picture to YUVA.\n");
-        goto Error;
+      if (picture.use_argb != original_picture.use_argb) {
+        // Somehow, the WebPEncode() call converted the original picture.
+        // We need to make both match before calling WebPPictureDistortion().
+        int ok = 0;
+        if (picture.use_argb) {
+          ok = WebPPictureYUVAToARGB(&original_picture);
+        } else {
+          ok = WebPPictureARGBToYUVA(&original_picture, WEBP_YUV420A);
+        }
+        if (!ok) {
+          fprintf(stderr, "Error while converting original picture.\n");
+          goto Error;
+        }
      }
      if (!WebPPictureDistortion(&picture, &original_picture,
                                 print_distortion, values)) {
@ -1120,9 +1101,14 @@ int main(int argc, const char *argv[]) {
        goto Error;
      }
      if (!short_output) {
-        fprintf(stderr, "%s: Y:%.2f U:%.2f V:%.2f A:%.2f  Total:%.2f\n",
-                distortion_names[print_distortion],
-                values[0], values[1], values[2], values[3], values[4]);
+        fprintf(stderr, "%s: ", distortion_names[print_distortion]);
+        if (picture.use_argb) {
+          fprintf(stderr, "B:%.2f G:%.2f R:%.2f A:%.2f  Total:%.2f\n",
+                  values[0], values[1], values[2], values[3], values[4]);
+        } else {
+          fprintf(stderr, "Y:%.2f U:%.2f V:%.2f A:%.2f  Total:%.2f\n",
+                  values[0], values[1], values[2], values[3], values[4]);
+        }
      } else {
        fprintf(stderr, "%7d %.4f\n", picture.stats->coded_size, values[4]);
      }
@ -1134,11 +1120,7 @@ int main(int argc, const char *argv[]) {
  return_value = 0;

 Error:
-#if WEBP_ENCODER_ABI_VERSION > 0x0203
  WebPMemoryWriterClear(&memory_writer);
-#else
-  free(memory_writer.mem);
-#endif
  free(picture.extra_info);
  MetadataFree(&metadata);
  WebPPictureFree(&picture);
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -44,6 +44,7 @@
 #include "./stopwatch.h"

 static int verbose = 0;
+static int quiet = 0;
 #ifndef WEBP_DLL
 #ifdef __cplusplus
 extern "C" {
@ -66,8 +67,13 @@ typedef enum {
  PGM,
  BMP,
  TIFF,
-  YUV,
-  ALPHA_PLANE_ONLY  // this is for experimenting only
+  RAW_YUV,
+  ALPHA_PLANE_ONLY,  // this is for experimenting only
+  // forced colorspace output (for testing, mostly)
+  RGB, RGBA, BGR, BGRA, ARGB,
+  RGBA_4444, RGB_565,
+  rgbA, bgrA, Argb, rgbA_4444,
+  YUV, YUVA
 } OutputFileFormat;

 #ifdef HAVE_WINCODEC_H
@ -174,7 +180,7 @@ static int WritePNG(const char* out_file_name, int use_stdout,
  const uint32_t height = buffer->height;
  uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
-  const int has_alpha = (buffer->colorspace == MODE_BGRA);
+  const int has_alpha = WebPIsAlphaMode(buffer->colorspace);

  return SUCCEEDED(WriteUsingWIC(out_file_name, use_stdout,
                                 MAKE_REFGUID(GUID_ContainerFormatPng),
@ -192,7 +198,7 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  const uint32_t height = buffer->height;
  uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
-  const int has_alpha = (buffer->colorspace == MODE_RGBA);
+  const int has_alpha = WebPIsAlphaMode(buffer->colorspace);
  volatile png_structp png;
  volatile png_infop info;
  png_uint_32 y;
@ -258,6 +264,24 @@ static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
  return 1;
 }

+// Debugging aid: dumps a 16-bit-per-pixel buffer (RGBA4444, RGB565, ...) as a
+// binary PGM in which every pixel occupies two consecutive gray samples, so
+// the written image is twice as wide as the decoded one.
+// Returns 1 on success and 0 as soon as a row cannot be fully written.
+static int Write16bAsPGM(FILE* fout, const WebPDecBuffer* const buffer) {
+  const uint32_t kBytesPerPixel = 2;
+  const uint8_t* const pixels = buffer->u.RGBA.rgba;
+  const int row_stride = buffer->u.RGBA.stride;
+  const uint32_t num_cols = buffer->width;
+  const uint32_t num_rows = buffer->height;
+  uint32_t row = 0;
+
+  fprintf(fout, "P5\n%u %u\n255\n", num_cols * kBytesPerPixel, num_rows);
+  while (row < num_rows) {
+    // Each row is written as 'kBytesPerPixel' items of 'num_cols' bytes, so a
+    // complete write returns exactly 'kBytesPerPixel'.
+    const size_t written =
+        fwrite(pixels + row * row_stride, num_cols, kBytesPerPixel, fout);
+    if (written != kBytesPerPixel) return 0;
+    ++row;
+  }
+  return 1;
+}
+
 static void PutLE16(uint8_t* const dst, uint32_t value) {
  dst[0] = (value >> 0) & 0xff;
  dst[1] = (value >> 8) & 0xff;
@ -270,7 +294,7 @@ static void PutLE32(uint8_t* const dst, uint32_t value) {

 #define BMP_HEADER_SIZE 54
 static int WriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
-  const int has_alpha = (buffer->colorspace != MODE_BGR);
+  const int has_alpha = WebPIsAlphaMode(buffer->colorspace);
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
  const uint8_t* const rgba = buffer->u.RGBA.rgba;
@ -331,7 +355,7 @@ static int WriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
 #define TIFF_HEADER_SIZE (EXTRA_DATA_OFFSET + EXTRA_DATA_SIZE)

 static int WriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
-  const int has_alpha = (buffer->colorspace != MODE_RGB);
+  const int has_alpha = WebPIsAlphaMode(buffer->colorspace);
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
  const uint8_t* const rgba = buffer->u.RGBA.rgba;
@ -417,7 +441,7 @@ static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
 // format=PGM: save a grayscale PGM file using the IMC4 layout
 // (http://www.fourcc.org/yuv.php#IMC4). This is a very convenient format for
 // viewing the samples, esp. for odd dimensions.
-// format=YUV: just save the Y/U/V/A planes sequentially without header.
+// format=RAW_YUV: just save the Y/U/V/A planes sequentially without header.
 static int WritePGMOrYUV(FILE* fout, const WebPDecBuffer* const buffer,
                         OutputFileFormat format) {
  const int width = buffer->width;
@ -425,7 +449,7 @@ static int WritePGMOrYUV(FILE* fout, const WebPDecBuffer* const buffer,
  const WebPYUVABuffer* const yuv = &buffer->u.YUVA;
  int ok = 1;
  int y;
-  const int pad = (format == YUV) ? 0 : 1;
+  const int pad = (format == RAW_YUV) ? 0 : 1;
  const int uv_width = (width + 1) / 2;
  const int uv_height = (height + 1) / 2;
  const int out_stride = (width + pad) & ~pad;
@ -486,7 +510,9 @@ static int SaveOutput(const WebPDecBuffer* const buffer,
    }
  }

-  if (format == PNG) {
+  if (format == PNG ||
+      format == RGBA || format == BGRA || format == ARGB ||
+      format == rgbA || format == bgrA || format == Argb) {
 #ifdef HAVE_WINCODEC_H
    ok &= WritePNG(out_file, use_stdout, buffer);
 #else
@ -494,14 +520,17 @@ static int SaveOutput(const WebPDecBuffer* const buffer,
 #endif
  } else if (format == PAM) {
    ok &= WritePPM(fout, buffer, 1);
-  } else if (format == PPM) {
+  } else if (format == PPM || format == RGB || format == BGR) {
    ok &= WritePPM(fout, buffer, 0);
+  } else if (format == RGBA_4444 || format == RGB_565 || format == rgbA_4444) {
+    ok &= Write16bAsPGM(fout, buffer);
  } else if (format == BMP) {
    ok &= WriteBMP(fout, buffer);
  } else if (format == TIFF) {
    ok &= WriteTIFF(fout, buffer);
-  } else if (format == PGM || format == YUV) {
-    ok &= WritePGMOrYUV(fout, buffer, format);
+  } else if (format == PGM || format == RAW_YUV ||
+             format == YUV || format == YUVA) {
+    ok &= WritePGMOrYUV(fout, buffer, format == RAW_YUV ? RAW_YUV : PGM);
  } else if (format == ALPHA_PLANE_ONLY) {
    ok &= WriteAlphaPlane(fout, buffer);
  }
@ -509,10 +538,12 @@ static int SaveOutput(const WebPDecBuffer* const buffer,
    fclose(fout);
  }
  if (ok) {
-    if (use_stdout) {
-      fprintf(stderr, "Saved to stdout\n");
-    } else {
-      fprintf(stderr, "Saved file %s\n", out_file);
+    if (!quiet) {
+      if (use_stdout) {
+        fprintf(stderr, "Saved to stdout\n");
+      } else {
+        fprintf(stderr, "Saved file %s\n", out_file);
+      }
    }
    if (verbose) {
      const double write_time = StopwatchReadAndReset(&stop_watch);
@ -541,24 +572,21 @@ static void Help(void) {
         "  -yuv ......... save the raw YUV samples in flat layout\n"
         "\n"
         " Other options are:\n"
-         "  -version  .... print version number and exit\n"
+         "  -version ..... print version number and exit\n"
         "  -nofancy ..... don't use the fancy YUV420 upscaler\n"
         "  -nofilter .... disable in-loop filtering\n"
         "  -nodither .... disable dithering\n"
         "  -dither <d> .. dithering strength (in 0..100)\n"
-#if WEBP_DECODER_ABI_VERSION > 0x0204
         "  -alpha_dither  use alpha-plane dithering if needed\n"
-#endif
         "  -mt .......... use multi-threading\n"
         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
-         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
-#if WEBP_DECODER_ABI_VERSION > 0x0203
+         "  -resize <w> <h> ......... scale the output (*after* any cropping)\n"
         "  -flip ........ flip the output vertically\n"
-#endif
         "  -alpha ....... only save the alpha plane\n"
         "  -incremental . use incremental decoding (useful for tests)\n"
-         "  -h     ....... this help message\n"
-         "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
+         "  -h ........... this help message\n"
+         "  -v ........... verbose (e.g. print encoding/decoding times)\n"
+         "  -quiet ....... quiet mode, don't print anything\n"
 #ifndef WEBP_DLL
         "  -noasm ....... disable all assembly optimizations\n"
 #endif
@ -569,6 +597,70 @@ static const char* const kFormatType[] = {
  "unspecified", "lossy", "lossless"
 };

+// Allocates caller-owned ("external") memory for the decoded output and
+// describes it in 'config->output'.
+// The picture dimensions are taken from the scaling options when scaling is
+// enabled, otherwise from the cropping options when cropping is enabled,
+// otherwise from the input bitstream features. Strides are deliberately
+// larger than the minimum so that the decoder's stride handling is exercised.
+// 'format' is expected to be one of the forced pixel formats (RGB ... YUVA);
+// 'use_external_memory' is stored as-is in 'is_external_memory'.
+// Returns the allocated memory, which the caller must free(), or NULL if the
+// dimensions are negative, the sizes do not fit the range previously
+// representable (32 bits), or the allocation fails.
+static uint8_t* AllocateExternalBuffer(WebPDecoderConfig* config,
+                                       OutputFileFormat format,
+                                       int use_external_memory) {
+  // Strides are stored as 'int' and sizes used to be computed as 'uint32_t':
+  // anything larger is rejected instead of silently wrapping around.
+  const uint64_t kMaxStride = 0x7fffffffu;
+  const uint64_t kMaxSize = 0xffffffffu;
+  uint8_t* external_buffer = NULL;
+  WebPDecBuffer* const output_buffer = &config->output;
+  int w = config->input.width;
+  int h = config->input.height;
+  if (config->options.use_scaling) {
+    w = config->options.scaled_width;
+    h = config->options.scaled_height;
+  } else if (config->options.use_cropping) {
+    w = config->options.crop_width;
+    h = config->options.crop_height;
+  }
+  // Scaling / cropping values may come straight from the command line.
+  if (w < 0 || h < 0) return NULL;
+
+  if (format >= RGB && format <= rgbA_4444) {
+    const int bpp = (format == RGB || format == BGR) ? 3
+                  : (format == RGBA_4444 || format == rgbA_4444 ||
+                     format == RGB_565) ? 2
+                  : 4;
+    const uint64_t stride = (uint64_t)bpp * w + 7;   // <- just for exercising
+    uint64_t total_size;
+    if (stride > kMaxStride) return NULL;
+    total_size = stride * h;
+    if (total_size > kMaxSize) return NULL;
+    external_buffer = (uint8_t*)malloc((size_t)total_size);
+    if (external_buffer == NULL) return NULL;
+    output_buffer->u.RGBA.stride = (int)stride;
+    output_buffer->u.RGBA.size = (size_t)total_size;
+    output_buffer->u.RGBA.rgba = external_buffer;
+  } else {    // YUV and YUVA
+    const int has_alpha = WebPIsAlphaMode(output_buffer->colorspace);
+    // Padded strides, again just for exercising.
+    const uint64_t stride = (uint64_t)w + 3;
+    const uint64_t uv_stride = ((uint64_t)w + 1) / 2 + 13;
+    const uint64_t uv_height = ((uint64_t)h + 1) / 2;
+    // Every plane size is 'stride x number of rows'. The row count is
+    // computed first so that the division by two applies to the height only
+    // and not to the whole 'uv_stride * (h + 1)' product.
+    const uint64_t y_size = stride * h;
+    const uint64_t uv_size = uv_stride * uv_height;
+    const uint64_t total_size = y_size * (has_alpha ? 2 : 1) + 2 * uv_size;
+    uint8_t* tmp;
+    assert(format >= YUV && format <= YUVA);
+    if (stride > kMaxStride || total_size > kMaxSize) return NULL;
+    external_buffer = (uint8_t*)malloc((size_t)total_size);
+    if (external_buffer == NULL) return NULL;
+    // Planes are laid out back to back: Y, then A (if any), then U, then V.
+    tmp = external_buffer;
+    output_buffer->u.YUVA.y = tmp;
+    output_buffer->u.YUVA.y_stride = (int)stride;
+    output_buffer->u.YUVA.y_size = (size_t)y_size;
+    tmp += output_buffer->u.YUVA.y_size;
+    if (has_alpha) {
+      // The alpha plane has the same geometry as the luma plane.
+      output_buffer->u.YUVA.a = tmp;
+      output_buffer->u.YUVA.a_stride = (int)stride;
+      output_buffer->u.YUVA.a_size = (size_t)y_size;
+      tmp += output_buffer->u.YUVA.a_size;
+    } else {
+      output_buffer->u.YUVA.a = NULL;
+      output_buffer->u.YUVA.a_stride = 0;
+    }
+    output_buffer->u.YUVA.u = tmp;
+    output_buffer->u.YUVA.u_stride = (int)uv_stride;
+    output_buffer->u.YUVA.u_size = (size_t)uv_size;
+    tmp += output_buffer->u.YUVA.u_size;
+
+    output_buffer->u.YUVA.v = tmp;
+    output_buffer->u.YUVA.v_stride = (int)uv_stride;
+    output_buffer->u.YUVA.v_size = (size_t)uv_size;
+    tmp += output_buffer->u.YUVA.v_size;
+    assert(tmp <= external_buffer + total_size);
+  }
+  output_buffer->is_external_memory = use_external_memory;
+  return external_buffer;
+}
+
 int main(int argc, const char *argv[]) {
  int ok = 0;
  const char *in_file = NULL;
@ -578,6 +670,10 @@ int main(int argc, const char *argv[]) {
  WebPDecBuffer* const output_buffer = &config.output;
  WebPBitstreamFeatures* const bitstream = &config.input;
  OutputFileFormat format = PNG;
+  uint8_t* external_buffer = NULL;
+  int use_external_memory = 0;
+  const uint8_t* data = NULL;
+
  int incremental = 0;
  int c;

@ -607,6 +703,8 @@ int main(int argc, const char *argv[]) {
      format = BMP;
    } else if (!strcmp(argv[c], "-tiff")) {
      format = TIFF;
+    } else if (!strcmp(argv[c], "-quiet")) {
+      quiet = 1;
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetDecoderVersion();
      printf("%d.%d.%d\n",
@ -615,13 +713,36 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-pgm")) {
      format = PGM;
    } else if (!strcmp(argv[c], "-yuv")) {
-      format = YUV;
+      format = RAW_YUV;
+    } else if (!strcmp(argv[c], "-pixel_format") && c < argc - 1) {
+      const char* const fmt = argv[++c];
+      if      (!strcmp(fmt, "RGB"))  format = RGB;
+      else if (!strcmp(fmt, "RGBA")) format = RGBA;
+      else if (!strcmp(fmt, "BGR"))  format = BGR;
+      else if (!strcmp(fmt, "BGRA")) format = BGRA;
+      else if (!strcmp(fmt, "ARGB")) format = ARGB;
+      else if (!strcmp(fmt, "RGBA_4444")) format = RGBA_4444;
+      else if (!strcmp(fmt, "RGB_565")) format = RGB_565;
+      else if (!strcmp(fmt, "rgbA")) format = rgbA;
+      else if (!strcmp(fmt, "bgrA")) format = bgrA;
+      else if (!strcmp(fmt, "Argb")) format = Argb;
+      else if (!strcmp(fmt, "rgbA_4444")) format = rgbA_4444;
+      else if (!strcmp(fmt, "YUV"))  format = YUV;
+      else if (!strcmp(fmt, "YUVA")) format = YUVA;
+      else {
+        fprintf(stderr, "Can't parse pixel_format %s\n", fmt);
+        parse_error = 1;
+      }
+    } else if (!strcmp(argv[c], "-external_memory") && c < argc - 1) {
+      use_external_memory = ExUtilGetInt(argv[++c], 0, &parse_error);
+      parse_error |= (use_external_memory > 2 || use_external_memory < 0);
+      if (parse_error) {
+        fprintf(stderr, "Can't parse 'external_memory' value %s\n", argv[c]);
+      }
    } else if (!strcmp(argv[c], "-mt")) {
      config.options.use_threads = 1;
-#if WEBP_DECODER_ABI_VERSION > 0x0204
    } else if (!strcmp(argv[c], "-alpha_dither")) {
      config.options.alpha_dithering_strength = 100;
-#endif
    } else if (!strcmp(argv[c], "-nodither")) {
      config.options.dithering_strength = 0;
    } else if (!strcmp(argv[c], "-dither") && c < argc - 1) {
@ -633,14 +754,13 @@ int main(int argc, const char *argv[]) {
      config.options.crop_top    = ExUtilGetInt(argv[++c], 0, &parse_error);
      config.options.crop_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
      config.options.crop_height = ExUtilGetInt(argv[++c], 0, &parse_error);
-    } else if (!strcmp(argv[c], "-scale") && c < argc - 2) {
+    } else if ((!strcmp(argv[c], "-scale") || !strcmp(argv[c], "-resize")) &&
+               c < argc - 2) {  // '-scale' is left for compatibility
      config.options.use_scaling = 1;
      config.options.scaled_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
      config.options.scaled_height = ExUtilGetInt(argv[++c], 0, &parse_error);
-#if WEBP_DECODER_ABI_VERSION > 0x0203
    } else if (!strcmp(argv[c], "-flip")) {
      config.options.flip = 1;
-#endif
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
 #ifndef WEBP_DLL
@ -672,10 +792,11 @@ int main(int argc, const char *argv[]) {
    return -1;
  }

+  if (quiet) verbose = 0;
+
  {
    VP8StatusCode status = VP8_STATUS_OK;
    size_t data_size = 0;
-    const uint8_t* data = NULL;
    if (!ExUtilLoadWebP(in_file, &data, &data_size, bitstream)) {
      return -1;
    }
@ -702,15 +823,33 @@ int main(int argc, const char *argv[]) {
            bitstream->has_alpha ? MODE_rgbA : MODE_RGB;
        break;
      case PGM:
-      case YUV:
+      case RAW_YUV:
        output_buffer->colorspace = bitstream->has_alpha ? MODE_YUVA : MODE_YUV;
        break;
      case ALPHA_PLANE_ONLY:
        output_buffer->colorspace = MODE_YUVA;
        break;
-      default:
-        free((void*)data);
-        return -1;
+      // forced modes:
+      case RGB: output_buffer->colorspace = MODE_RGB; break;
+      case RGBA: output_buffer->colorspace = MODE_RGBA; break;
+      case BGR: output_buffer->colorspace = MODE_BGR; break;
+      case BGRA: output_buffer->colorspace = MODE_BGRA; break;
+      case ARGB: output_buffer->colorspace = MODE_ARGB; break;
+      case RGBA_4444: output_buffer->colorspace = MODE_RGBA_4444; break;
+      case RGB_565: output_buffer->colorspace = MODE_RGB_565; break;
+      case rgbA: output_buffer->colorspace = MODE_rgbA; break;
+      case bgrA: output_buffer->colorspace = MODE_bgrA; break;
+      case Argb: output_buffer->colorspace = MODE_Argb; break;
+      case rgbA_4444: output_buffer->colorspace = MODE_rgbA_4444; break;
+      case YUV: output_buffer->colorspace = MODE_YUV; break;
+      case YUVA: output_buffer->colorspace = MODE_YUVA; break;
+      default: goto Exit;
+    }
+
+    if (use_external_memory > 0 && format >= RGB) {
+      external_buffer = AllocateExternalBuffer(&config, format,
+                                               use_external_memory);
+      if (external_buffer == NULL) goto Exit;
    }

    if (incremental) {
@ -719,7 +858,6 @@ int main(int argc, const char *argv[]) {
      status = ExUtilDecodeWebP(data, data_size, verbose, &config);
    }

-    free((void*)data);
    ok = (status == VP8_STATUS_OK);
    if (!ok) {
      ExUtilPrintWebPError(in_file, status);
@ -728,23 +866,29 @@ int main(int argc, const char *argv[]) {
  }

  if (out_file != NULL) {
-    fprintf(stderr, "Decoded %s. Dimensions: %d x %d %s. Format: %s. "
-                    "Now saving...\n",
-            in_file, output_buffer->width, output_buffer->height,
-            bitstream->has_alpha ? " (with alpha)" : "",
-            kFormatType[bitstream->format]);
+    if (!quiet) {
+      fprintf(stderr, "Decoded %s. Dimensions: %d x %d %s. Format: %s. "
+                      "Now saving...\n",
+              in_file, output_buffer->width, output_buffer->height,
+              bitstream->has_alpha ? " (with alpha)" : "",
+              kFormatType[bitstream->format]);
+    }
    ok = SaveOutput(output_buffer, format, out_file);
  } else {
-    fprintf(stderr, "File %s can be decoded "
-                    "(dimensions: %d x %d %s. Format: %s).\n",
-            in_file, output_buffer->width, output_buffer->height,
-            bitstream->has_alpha ? " (with alpha)" : "",
-            kFormatType[bitstream->format]);
-    fprintf(stderr, "Nothing written; "
-                    "use -o flag to save the result as e.g. PNG.\n");
+    if (!quiet) {
+      fprintf(stderr, "File %s can be decoded "
+                      "(dimensions: %d x %d %s. Format: %s).\n",
+              in_file, output_buffer->width, output_buffer->height,
+              bitstream->has_alpha ? " (with alpha)" : "",
+              kFormatType[bitstream->format]);
+      fprintf(stderr, "Nothing written; "
+                      "use -o flag to save the result as e.g. PNG.\n");
+    }
  }
 Exit:
  WebPFreeDecBuffer(output_buffer);
+  free((void*)external_buffer);
+  free((void*)data);
  return ok ? 0 : -1;
 }

--- a/examples/example_util.c
+++ b/examples/example_util.c
@ -243,7 +243,19 @@ VP8StatusCode ExUtilDecodeWebPIncremental(
      fprintf(stderr, "Failed during WebPINewDecoder().\n");
      return VP8_STATUS_OUT_OF_MEMORY;
    } else {
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      size_t size = 0;
+      const size_t incr = 2 + (data_size / 20);
+      while (size < data_size) {
+        size_t next_size = size + (rand() % incr);
+        if (next_size > data_size) next_size = data_size;
+        status = WebPIUpdate(idec, data, next_size);
+        if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED) break;
+        size = next_size;
+      }
+#else
      status = WebPIUpdate(idec, data, data_size);
+#endif
      WebPIDelete(idec);
    }
  }
@ -256,3 +268,14 @@ VP8StatusCode ExUtilDecodeWebPIncremental(
 }

 // -----------------------------------------------------------------------------
+
+// Copies 'height' rows of 'width' bytes from 'src' to 'dst'. After each row,
+// 'src' advances by 'src_stride' bytes and 'dst' by 'dst_stride' bytes.
+// Nothing is copied when 'height' is zero or negative.
+void ExUtilCopyPlane(const uint8_t* src, int src_stride,
+                     uint8_t* dst, int dst_stride, int width, int height) {
+  const size_t row_bytes = width * sizeof(*dst);
+  int row;
+  for (row = 0; row < height; ++row) {
+    memcpy(dst, src, row_bytes);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// -----------------------------------------------------------------------------
--- a/examples/example_util.h
+++ b/examples/example_util.h
@ -53,6 +53,12 @@ int ExUtilReadFromStdin(const uint8_t** data, size_t* data_size);
 int ExUtilWriteFile(const char* const file_name,
                    const uint8_t* data, size_t data_size);

+//------------------------------------------------------------------------------
+
+// Copy width x height pixels from 'src' to 'dst' honoring the strides.
+void ExUtilCopyPlane(const uint8_t* src, int src_stride,
+                     uint8_t* dst, int dst_stride, int width, int height);
+
 //------------------------------------------------------------------------------
 // WebP decoding

--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -14,6 +14,7 @@

 #include <assert.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>

 #ifdef HAVE_CONFIG_H
@ -26,182 +27,11 @@
 #include "webp/encode.h"
 #include "webp/mux.h"
 #include "./example_util.h"
-#include "./gif2webp_util.h"
-
-// GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
-#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR)
-# define LOCAL_GIF_VERSION ((GIFLIB_MAJOR << 8) | GIFLIB_MINOR)
-# define LOCAL_GIF_PREREQ(maj, min) \
-    (LOCAL_GIF_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_GIF_VERSION 0
-# define LOCAL_GIF_PREREQ(maj, min) 0
-#endif
-
-#define GIF_TRANSPARENT_MASK 0x01
-#define GIF_DISPOSE_MASK     0x07
-#define GIF_DISPOSE_SHIFT    2
-#define WHITE_COLOR          0xffffffff
-#define MAX_CACHE_SIZE       30
+#include "./gifdec.h"

 //------------------------------------------------------------------------------

-static int transparent_index = -1;  // Opaque frame by default.
-
-static void SanitizeKeyFrameIntervals(size_t* const kmin_ptr,
-                                      size_t* const kmax_ptr) {
-  size_t kmin = *kmin_ptr;
-  size_t kmax = *kmax_ptr;
-  int print_warning = 1;
-
-  if (kmin == 0) {  // Disable keyframe insertion.
-    kmax = ~0;
-    kmin = kmax - 1;
-    print_warning = 0;
-  }
-  if (kmax == 0) {
-    kmax = ~0;
-    print_warning = 0;
-  }
-
-  if (kmin >= kmax) {
-    kmin = kmax - 1;
-    if (print_warning) {
-      fprintf(stderr,
-              "WARNING: Setting kmin = %d, so that kmin < kmax.\n", (int)kmin);
-    }
-  } else if (kmin < (kmax / 2 + 1)) {
-    // This ensures that cache.keyframe + kmin >= kmax is always true. So, we
-    // can flush all the frames in the ‘count_since_key_frame == kmax’ case.
-    kmin = (kmax / 2 + 1);
-    if (print_warning) {
-      fprintf(stderr,
-              "WARNING: Setting kmin = %d, so that kmin >= kmax / 2 + 1.\n",
-              (int)kmin);
-    }
-  }
-  // Limit the max number of frames that are allocated.
-  if (kmax - kmin > MAX_CACHE_SIZE) {
-    kmin = kmax - MAX_CACHE_SIZE;
-    if (print_warning) {
-      fprintf(stderr,
-              "WARNING: Setting kmin = %d, so that kmax - kmin <= 30.\n",
-              (int)kmin);
-    }
-  }
-  *kmin_ptr = kmin;
-  *kmax_ptr = kmax;
-}
-
-static void Remap(const uint8_t* const src, const GifFileType* const gif,
-                  uint32_t* dst, int len) {
-  int i;
-  const GifColorType* colors;
-  const ColorMapObject* const cmap =
-      gif->Image.ColorMap ? gif->Image.ColorMap : gif->SColorMap;
-  if (cmap == NULL) return;
-  colors = cmap->Colors;
-
-  for (i = 0; i < len; ++i) {
-    const GifColorType c = colors[src[i]];
-    dst[i] = (src[i] == transparent_index) ? WEBP_UTIL_TRANSPARENT_COLOR
-           : c.Blue | (c.Green << 8) | (c.Red << 16) | (0xff << 24);
-  }
-}
-
-// Read the GIF image frame.
-static int ReadFrame(GifFileType* const gif, WebPFrameRect* const gif_rect,
-                     WebPPicture* const webp_frame) {
-  WebPPicture sub_image;
-  const GifImageDesc* const image_desc = &gif->Image;
-  uint32_t* dst = NULL;
-  uint8_t* tmp = NULL;
-  int ok = 0;
-  WebPFrameRect rect = {
-      image_desc->Left, image_desc->Top, image_desc->Width, image_desc->Height
-  };
-  *gif_rect = rect;
-
-  // Use a view for the sub-picture:
-  if (!WebPPictureView(webp_frame, rect.x_offset, rect.y_offset,
-                       rect.width, rect.height, &sub_image)) {
-    fprintf(stderr, "Sub-image %dx%d at position %d,%d is invalid!\n",
-            rect.width, rect.height, rect.x_offset, rect.y_offset);
-    return 0;
-  }
-  dst = sub_image.argb;
-
-  tmp = (uint8_t*)malloc(rect.width * sizeof(*tmp));
-  if (tmp == NULL) goto End;
-
-  if (image_desc->Interlace) {  // Interlaced image.
-    // We need 4 passes, with the following offsets and jumps.
-    const int interlace_offsets[] = { 0, 4, 2, 1 };
-    const int interlace_jumps[]   = { 8, 8, 4, 2 };
-    int pass;
-    for (pass = 0; pass < 4; ++pass) {
-      int y;
-      for (y = interlace_offsets[pass]; y < rect.height;
-           y += interlace_jumps[pass]) {
-        if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
-        Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
-      }
-    }
-  } else {  // Non-interlaced image.
-    int y;
-    for (y = 0; y < rect.height; ++y) {
-      if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
-      Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
-    }
-  }
-  ok = 1;
-
- End:
-  if (!ok) webp_frame->error_code = sub_image.error_code;
-  WebPPictureFree(&sub_image);
-  free(tmp);
-  return ok;
-}
-
-static void GetBackgroundColor(const ColorMapObject* const color_map,
-                               int bgcolor_idx, uint32_t* const bgcolor) {
-  if (transparent_index != -1 && bgcolor_idx == transparent_index) {
-    *bgcolor = WEBP_UTIL_TRANSPARENT_COLOR;  // Special case.
-  } else if (color_map == NULL || color_map->Colors == NULL
-             || bgcolor_idx >= color_map->ColorCount) {
-    *bgcolor = WHITE_COLOR;
-    fprintf(stderr,
-            "GIF decode warning: invalid background color index. Assuming "
-            "white background.\n");
-  } else {
-    const GifColorType color = color_map->Colors[bgcolor_idx];
-    *bgcolor = (0xff        << 24)
-             | (color.Red   << 16)
-             | (color.Green <<  8)
-             | (color.Blue  <<  0);
-  }
-}
-
-static void DisplayGifError(const GifFileType* const gif, int gif_error) {
-  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
-#if LOCAL_GIF_PREREQ(4,2)
-#if LOCAL_GIF_PREREQ(5,0)
-  // Static string actually, hence the const char* cast.
-  const char* error_str = (const char*)GifErrorString(
-      (gif == NULL) ? gif_error : gif->Error);
-#else
-  const char* error_str = (const char*)GifErrorString();
-  (void)gif;
-#endif
-  if (error_str == NULL) error_str = "Unknown error";
-  fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
-#else
-  (void)gif;
-  fprintf(stderr, "GIFLib Error %d: ", gif_error);
-  PrintGifError();
-  fprintf(stderr, "\n");
-#endif
-}
+static int transparent_index = GIF_INDEX_INVALID;  // Opaque by default.

 static const char* const kErrorMessages[-WEBP_MUX_NOT_ENOUGH_DATA + 1] = {
  "WEBP_MUX_NOT_FOUND", "WEBP_MUX_INVALID_ARGUMENT", "WEBP_MUX_BAD_DATA",
@ -225,12 +55,16 @@ static void Help(void) {
  printf("Usage:\n");
  printf(" gif2webp [options] gif_file -o webp_file\n");
  printf("Options:\n");
-  printf("  -h / -help  ............ this help\n");
+  printf("  -h / -help ............. this help\n");
  printf("  -lossy ................. encode image using lossy compression\n");
  printf("  -mixed ................. for each frame in the image, pick lossy\n"
         "                           or lossless compression heuristically\n");
  printf("  -q <float> ............. quality factor (0:small..100:big)\n");
  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
+  printf("  -min_size .............. minimize output size (default:off)\n"
+         "                           lossless compression by default; can be\n"
+         "                           combined with -q, -m, -lossy or -mixed\n"
+         "                           options\n");
  printf("  -kmin <int> ............ min distance between key frames\n");
  printf("  -kmax <int> ............ max distance between key frames\n");
  printf("  -f <int> ............... filter strength (0=off..100)\n");
@ -257,35 +91,48 @@ int main(int argc, const char *argv[]) {
  const char *in_file = NULL, *out_file = NULL;
  FILE* out = NULL;
  GifFileType* gif = NULL;
+  int frame_duration = 0;
+  int frame_timestamp = 0;
+  GIFDisposeMethod orig_dispose = GIF_DISPOSE_NONE;
+
+  WebPPicture frame;                // Frame rectangle only (not disposed).
+  WebPPicture curr_canvas;          // Not disposed.
+  WebPPicture prev_canvas;          // Disposed.
+
+  WebPAnimEncoder* enc = NULL;
+  WebPAnimEncoderOptions enc_options;
  WebPConfig config;
-  WebPPicture frame;
-  int duration = 0;
-  FrameDisposeMethod orig_dispose = FRAME_DISPOSE_NONE;
-  WebPMuxAnimParams anim = { WHITE_COLOR, 0 };
-  WebPFrameCache* cache = NULL;

  int is_first_frame = 1;     // Whether we are processing the first frame.
  int done;
  int c;
  int quiet = 0;
-  WebPMux* mux = NULL;
-  WebPData webp_data = { NULL, 0 };
+  WebPData webp_data;
+
  int keep_metadata = METADATA_XMP;  // ICC not output by default.
-  int stored_icc = 0;  // Whether we have already stored an ICC profile.
-  int stored_xmp = 0;
+  WebPData icc_data;
+  int stored_icc = 0;         // Whether we have already stored an ICC profile.
+  WebPData xmp_data;
+  int stored_xmp = 0;         // Whether we have already stored an XMP profile.
+  int loop_count = 0;
+  int stored_loop_count = 0;  // Whether we have found an explicit loop count.
+  WebPMux* mux = NULL;

  int default_kmin = 1;  // Whether to use default kmin value.
  int default_kmax = 1;
-  size_t kmin = 0;
-  size_t kmax = 0;
-  int allow_mixed = 0;   // If true, each frame can be lossy or lossless.

-  if (!WebPConfigInit(&config) || !WebPPictureInit(&frame)) {
+  if (!WebPConfigInit(&config) || !WebPAnimEncoderOptionsInit(&enc_options) ||
+      !WebPPictureInit(&frame) || !WebPPictureInit(&curr_canvas) ||
+      !WebPPictureInit(&prev_canvas)) {
    fprintf(stderr, "Error! Version mismatch!\n");
    return -1;
  }
  config.lossless = 1;  // Use lossless compression by default.

+  WebPDataInit(&webp_data);
+  WebPDataInit(&icc_data);
+  WebPDataInit(&xmp_data);
+
  if (argc == 1) {
    Help();
    return 0;
@ -301,17 +148,19 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-lossy")) {
      config.lossless = 0;
    } else if (!strcmp(argv[c], "-mixed")) {
-      allow_mixed = 1;
+      enc_options.allow_mixed = 1;
      config.lossless = 0;
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
      config.method = ExUtilGetInt(argv[++c], 0, &parse_error);
+    } else if (!strcmp(argv[c], "-min_size")) {
+      enc_options.minimize_size = 1;
    } else if (!strcmp(argv[c], "-kmax") && c < argc - 1) {
-      kmax = ExUtilGetUInt(argv[++c], 0, &parse_error);
+      enc_options.kmax = ExUtilGetInt(argv[++c], 0, &parse_error);
      default_kmax = 0;
    } else if (!strcmp(argv[c], "-kmin") && c < argc - 1) {
-      kmin = ExUtilGetUInt(argv[++c], 0, &parse_error);
+      enc_options.kmin = ExUtilGetInt(argv[++c], 0, &parse_error);
      default_kmin = 0;
    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
      config.filter_strength = ExUtilGetInt(argv[++c], 0, &parse_error);
@ -366,8 +215,10 @@ int main(int argc, const char *argv[]) {
      return 0;
    } else if (!strcmp(argv[c], "-quiet")) {
      quiet = 1;
+      enc_options.verbose = 0;
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
+      enc_options.verbose = 1;
    } else if (!strcmp(argv[c], "--")) {
      if (c < argc - 1) in_file = argv[++c];
      break;
@ -387,12 +238,11 @@ int main(int argc, const char *argv[]) {

  // Appropriate default kmin, kmax values for lossy and lossless.
  if (default_kmin) {
-    kmin = config.lossless ? 9 : 3;
+    enc_options.kmin = config.lossless ? 9 : 3;
  }
  if (default_kmax) {
-    kmax = config.lossless ? 17 : 5;
+    enc_options.kmax = config.lossless ? 17 : 5;
  }
-  SanitizeKeyFrameIntervals(&kmin, &kmax);

  if (!WebPValidateConfig(&config)) {
    fprintf(stderr, "Error! Invalid configuration.\n");
@ -413,12 +263,6 @@ int main(int argc, const char *argv[]) {
 #endif
  if (gif == NULL) goto End;

-  mux = WebPMuxNew();
-  if (mux == NULL) {
-    fprintf(stderr, "ERROR: could not create a mux object.\n");
-    goto End;
-  }
-
  // Loop over GIF images
  done = 0;
  do {
@ -427,17 +271,17 @@ int main(int argc, const char *argv[]) {

    switch (type) {
      case IMAGE_DESC_RECORD_TYPE: {
-        WebPFrameRect gif_rect;
+        GIFFrameRect gif_rect;
        GifImageDesc* const image_desc = &gif->Image;

        if (!DGifGetImageDesc(gif)) goto End;

-        // Fix some broken GIF global headers that report
-        // 0 x 0 screen dimension.
        if (is_first_frame) {
          if (verbose) {
            printf("Canvas screen: %d x %d\n", gif->SWidth, gif->SHeight);
          }
+          // Fix some broken GIF global headers that report
+          // 0 x 0 screen dimension.
          if (gif->SWidth == 0 || gif->SHeight == 0) {
            image_desc->Left = 0;
            image_desc->Top = 0;
@ -451,61 +295,62 @@ int main(int argc, const char *argv[]) {
                     gif->SWidth, gif->SHeight);
            }
          }
-#if WEBP_MUX_ABI_VERSION > 0x0101
-          // Set definitive canvas size.
-          err = WebPMuxSetCanvasSize(mux, gif->SWidth, gif->SHeight);
-          if (err != WEBP_MUX_OK) {
-            fprintf(stderr, "Invalid canvas size %d x %d\n",
-                    gif->SWidth, gif->SHeight);
-            goto End;
-          }
-#endif
          // Allocate current buffer.
          frame.width = gif->SWidth;
          frame.height = gif->SHeight;
          frame.use_argb = 1;
          if (!WebPPictureAlloc(&frame)) goto End;
-          WebPUtilClearPic(&frame, NULL);
-
-          // Initialize cache.
-          cache = WebPFrameCacheNew(frame.width, frame.height,
-                                    kmin, kmax, allow_mixed);
-          if (cache == NULL) goto End;
+          GIFClearPic(&frame, NULL);
+          WebPPictureCopy(&frame, &curr_canvas);
+          WebPPictureCopy(&frame, &prev_canvas);

          // Background color.
-          GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
-                             &anim.bgcolor);
+          GIFGetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
+                                transparent_index,
+                                &enc_options.anim_params.bgcolor);
+
+          // Initialize encoder.
+          enc = WebPAnimEncoderNew(curr_canvas.width, curr_canvas.height,
+                                   &enc_options);
+          if (enc == NULL) {
+            fprintf(stderr,
+                    "Error! Could not create encoder object. Possibly due to "
+                    "a memory error.\n");
+            goto End;
+          }
+          is_first_frame = 0;
        }
+
        // Some even more broken GIF can have sub-rect with zero width/height.
        if (image_desc->Width == 0 || image_desc->Height == 0) {
          image_desc->Width = gif->SWidth;
          image_desc->Height = gif->SHeight;
        }

-        if (!ReadFrame(gif, &gif_rect, &frame)) {
+        if (!GIFReadFrame(gif, transparent_index, &gif_rect, &frame)) {
          goto End;
        }
+        // Blend frame rectangle with previous canvas to compose full canvas.
+        // Note that 'curr_canvas' is same as 'prev_canvas' at this point.
+        GIFBlendFrames(&frame, &gif_rect, &curr_canvas);

-        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, orig_dispose,
-                                    duration, &frame)) {
-          fprintf(stderr, "Error! Cannot encode frame as WebP\n");
-          fprintf(stderr, "Error code: %d\n", frame.error_code);
+        if (!WebPAnimEncoderAdd(enc, &curr_canvas, frame_timestamp, &config)) {
+          fprintf(stderr, "%s\n", WebPAnimEncoderGetError(enc));
        }

-        err = WebPFrameCacheFlush(cache, verbose, mux);
-        if (err != WEBP_MUX_OK) {
-          fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
-                  ErrorString(err));
-          goto End;
-        }
-        is_first_frame = 0;
+        // Update canvases.
+        GIFDisposeFrame(orig_dispose, &gif_rect, &prev_canvas, &curr_canvas);
+        GIFCopyPixels(&curr_canvas, &prev_canvas);
+
+        // Update timestamp (for next frame).
+        frame_timestamp += frame_duration;

        // In GIF, graphic control extensions are optional for a frame, so we
        // may not get one before reading the next frame. To handle this case,
        // we reset frame properties to reasonable defaults for the next frame.
-        orig_dispose = FRAME_DISPOSE_NONE;
-        duration = 0;
-        transparent_index = -1;  // Opaque frame by default.
+        orig_dispose = GIF_DISPOSE_NONE;
+        frame_duration = 0;
+        transparent_index = GIF_INDEX_INVALID;
        break;
      }
      case EXTENSION_RECORD_TYPE: {
@ -519,25 +364,10 @@ int main(int argc, const char *argv[]) {
            break;  // Do nothing for now.
          }
          case GRAPHICS_EXT_FUNC_CODE: {
-            const int flags = data[1];
-            const int dispose = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
-            const int delay = data[2] | (data[3] << 8);  // In 10 ms units.
-            if (data[0] != 4) goto End;
-            duration = delay * 10;  // Duration is in 1 ms units for WebP.
-            switch (dispose) {
-              case 3:
-                orig_dispose = FRAME_DISPOSE_RESTORE_PREVIOUS;
-                break;
-              case 2:
-                orig_dispose = FRAME_DISPOSE_BACKGROUND;
-                break;
-              case 1:
-              case 0:
-              default:
-                orig_dispose = FRAME_DISPOSE_NONE;
-                break;
+            if (!GIFReadGraphicsExtension(data, &frame_duration, &orig_dispose,
+                                          &transparent_index)) {
+              goto End;
            }
-            transparent_index = (flags & GIF_TRANSPARENT_MASK) ? data[4] : -1;
            break;
          }
          case PLAINTEXT_EXT_FUNC_CODE: {
@ -547,14 +377,13 @@ int main(int argc, const char *argv[]) {
            if (data[0] != 11) break;    // Chunk is too short
            if (!memcmp(data + 1, "NETSCAPE2.0", 11) ||
                !memcmp(data + 1, "ANIMEXTS1.0", 11)) {
-              // Recognize and parse Netscape2.0 NAB extension for loop count.
-              if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
-              if (data == NULL) goto End;  // Loop count sub-block missing.
-              if (data[0] < 3 || data[1] != 1) break;   // wrong size/marker
-              anim.loop_count = data[2] | (data[3] << 8);
-              if (verbose) {
-                fprintf(stderr, "Loop count: %d\n", anim.loop_count);
+              if (!GIFReadLoopCount(gif, &data, &loop_count)) {
+                goto End;
              }
+              if (verbose) {
+                fprintf(stderr, "Loop count: %d\n", loop_count);
+              }
+              stored_loop_count = (loop_count != 0);
            } else {  // An extension containing metadata.
              // We only store the first encountered chunk of each type, and
              // only if requested by the user.
@ -565,56 +394,8 @@ int main(int argc, const char *argv[]) {
                                 !stored_icc &&
                                 !memcmp(data + 1, "ICCRGBG1012", 11);
              if (is_xmp || is_icc) {
-                const char* const fourccs[2] = { "XMP " , "ICCP" };
-                const char* const features[2] = { "XMP" , "ICC" };
-                WebPData metadata = { NULL, 0 };
-                // Construct metadata from sub-blocks.
-                // Usual case (including ICC profile): In each sub-block, the
-                // first byte specifies its size in bytes (0 to 255) and the
-                // rest of the bytes contain the data.
-                // Special case for XMP data: In each sub-block, the first byte
-                // is also part of the XMP payload. XMP in GIF also has a 257
-                // byte padding data. See the XMP specification for details.
-                while (1) {
-                  WebPData prev_metadata = metadata;
-                  WebPData subblock;
-                  if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) {
-                    WebPDataClear(&metadata);
-                    goto End;
-                  }
-                  if (data == NULL) break;  // Finished.
-                  subblock.size = is_xmp ? data[0] + 1 : data[0];
-                  assert(subblock.size > 0);
-                  subblock.bytes = is_xmp ? data : data + 1;
-                  metadata.bytes =
-                      (uint8_t*)realloc((void*)metadata.bytes,
-                                        prev_metadata.size + subblock.size);
-                  if (metadata.bytes == NULL) {
-                    WebPDataClear(&prev_metadata);
-                    goto End;
-                  }
-                  metadata.size += subblock.size;
-                  memcpy((void*)(metadata.bytes + prev_metadata.size),
-                         subblock.bytes, subblock.size);
-                }
-                if (is_xmp) {
-                  // XMP padding data is 0x01, 0xff, 0xfe ... 0x01, 0x00.
-                  const size_t xmp_pading_size = 257;
-                  if (metadata.size > xmp_pading_size) {
-                    metadata.size -= xmp_pading_size;
-                  }
-                }
-
-                // Add metadata chunk.
-                err = WebPMuxSetChunk(mux, fourccs[is_icc], &metadata, 1);
-                if (verbose) {
-                  fprintf(stderr, "%s size: %d\n",
-                          features[is_icc], (int)metadata.size);
-                }
-                WebPDataClear(&metadata);
-                if (err != WEBP_MUX_OK) {
-                  fprintf(stderr, "ERROR (%s): Could not set %s chunk.\n",
-                          ErrorString(err), features[is_icc]);
+                if (!GIFReadMetadata(gif, &data,
+                                     is_xmp ? &xmp_data : &icc_data)) {
                  goto End;
                }
                if (is_icc) {
@ -648,38 +429,88 @@ int main(int argc, const char *argv[]) {
    }
  } while (!done);

-  // Flush any pending frames.
-  err = WebPFrameCacheFlushAll(cache, verbose, mux);
-  if (err != WEBP_MUX_OK) {
-    fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
-            ErrorString(err));
+  // Last NULL frame.
+  if (!WebPAnimEncoderAdd(enc, NULL, frame_timestamp, NULL)) {
+    fprintf(stderr, "Error flushing WebP muxer.\n");
+    fprintf(stderr, "%s\n", WebPAnimEncoderGetError(enc));
+  }
+
+  if (!WebPAnimEncoderAssemble(enc, &webp_data)) {
+    fprintf(stderr, "%s\n", WebPAnimEncoderGetError(enc));
    goto End;
  }

-  // Finish muxing
-  err = WebPMuxSetAnimationParams(mux, &anim);
-  if (err != WEBP_MUX_OK) {
-    fprintf(stderr, "ERROR (%s): Could not set animation parameters.\n",
-            ErrorString(err));
-    goto End;
+  if (stored_loop_count || stored_icc || stored_xmp) {
+    // Re-mux to add loop count and/or metadata as needed.
+    mux = WebPMuxCreate(&webp_data, 1);
+    if (mux == NULL) {
+      fprintf(stderr, "ERROR: Could not re-mux to add loop count/metadata.\n");
+      goto End;
+    }
+    WebPDataClear(&webp_data);
+
+    if (stored_loop_count) {  // Update loop count.
+      WebPMuxAnimParams new_params;
+      err = WebPMuxGetAnimationParams(mux, &new_params);
+      if (err != WEBP_MUX_OK) {
+        fprintf(stderr, "ERROR (%s): Could not fetch loop count.\n",
+                ErrorString(err));
+        goto End;
+      }
+      new_params.loop_count = loop_count;
+      err = WebPMuxSetAnimationParams(mux, &new_params);
+      if (err != WEBP_MUX_OK) {
+        fprintf(stderr, "ERROR (%s): Could not update loop count.\n",
+                ErrorString(err));
+        goto End;
+      }
+    }
+
+    if (stored_icc) {   // Add ICCP chunk.
+      err = WebPMuxSetChunk(mux, "ICCP", &icc_data, 1);
+      if (verbose) {
+        fprintf(stderr, "ICC size: %d\n", (int)icc_data.size);
+      }
+      if (err != WEBP_MUX_OK) {
+        fprintf(stderr, "ERROR (%s): Could not set ICC chunk.\n",
+                ErrorString(err));
+        goto End;
+      }
+    }
+
+    if (stored_xmp) {   // Add XMP chunk.
+      err = WebPMuxSetChunk(mux, "XMP ", &xmp_data, 1);
+      if (verbose) {
+        fprintf(stderr, "XMP size: %d\n", (int)xmp_data.size);
+      }
+      if (err != WEBP_MUX_OK) {
+        fprintf(stderr, "ERROR (%s): Could not set XMP chunk.\n",
+                ErrorString(err));
+        goto End;
+      }
+    }
+
+    err = WebPMuxAssemble(mux, &webp_data);
+    if (err != WEBP_MUX_OK) {
+      fprintf(stderr, "ERROR (%s): Could not assemble when re-muxing to add "
+              "loop count/metadata.\n", ErrorString(err));
+      goto End;
+    }
  }

-  err = WebPMuxAssemble(mux, &webp_data);
-  if (err != WEBP_MUX_OK) {
-    fprintf(stderr, "ERROR (%s) assembling the WebP file.\n", ErrorString(err));
-    goto End;
-  }
  if (out_file != NULL) {
    if (!ExUtilWriteFile(out_file, webp_data.bytes, webp_data.size)) {
      fprintf(stderr, "Error writing output file: %s\n", out_file);
      goto End;
    }
    if (!quiet) {
-      fprintf(stderr, "Saved output file: %s\n", out_file);
+      fprintf(stderr, "Saved output file (%d bytes): %s\n",
+              (int)webp_data.size, out_file);
    }
  } else {
    if (!quiet) {
-      fprintf(stderr, "Nothing written; use -o flag to save the result.\n");
+      fprintf(stderr, "Nothing written; use -o flag to save the result "
+                      "(%d bytes).\n", (int)webp_data.size);
    }
  }

@ -688,14 +519,18 @@ int main(int argc, const char *argv[]) {
  gif_error = GIF_OK;

 End:
-  WebPDataClear(&webp_data);
+  WebPDataClear(&icc_data);
+  WebPDataClear(&xmp_data);
  WebPMuxDelete(mux);
+  WebPDataClear(&webp_data);
  WebPPictureFree(&frame);
-  WebPFrameCacheDelete(cache);
+  WebPPictureFree(&curr_canvas);
+  WebPPictureFree(&prev_canvas);
+  WebPAnimEncoderDelete(enc);
  if (out != NULL && out_file != NULL) fclose(out);

  if (gif_error != GIF_OK) {
-    DisplayGifError(gif, gif_error);
+    GIFDisplayError(gif, gif_error);
  }
  if (gif != NULL) {
 #if LOCAL_GIF_PREREQ(5,1)
--- a/examples/gif2webp_util.c
+++ b/examples/gif2webp_util.c
--- a/examples/gif2webp_util.h
+++ b/examples/gif2webp_util.h
@ -1,88 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//  Helper structs and methods for gif2webp tool.
-//
-// Author: Urvang (urvang@google.com)
-
-#ifndef WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
-#define WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
-
-#include <stdlib.h>
-
-#include "webp/mux.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//------------------------------------------------------------------------------
-// Helper utilities.
-
-#define WEBP_UTIL_TRANSPARENT_COLOR 0x00ffffff
-
-struct WebPPicture;
-
-// Includes all disposal methods, even the ones not supported by WebP bitstream.
-typedef enum FrameDisposeMethod {
-  FRAME_DISPOSE_NONE,
-  FRAME_DISPOSE_BACKGROUND,
-  FRAME_DISPOSE_RESTORE_PREVIOUS
-} FrameDisposeMethod;
-
-typedef struct {
-  int x_offset, y_offset, width, height;
-} WebPFrameRect;
-
-// Clear pixels in 'picture' within given 'rect' to transparent color.
-void WebPUtilClearPic(struct WebPPicture* const picture,
-                      const WebPFrameRect* const rect);
-
-//------------------------------------------------------------------------------
-// Frame cache.
-
-typedef struct WebPFrameCache WebPFrameCache;
-
-// Given the minimum distance between key frames 'kmin' and maximum distance
-// between key frames 'kmax', returns an appropriately allocated cache object.
-// If 'allow_mixed' is true, the subsequent calls to WebPFrameCacheAddFrame()
-// will heuristically pick lossy or lossless compression for each frame.
-// Use WebPFrameCacheDelete() to deallocate the 'cache'.
-WebPFrameCache* WebPFrameCacheNew(int width, int height,
-                                  size_t kmin, size_t kmax, int allow_mixed);
-
-// Release all the frame data from 'cache' and free 'cache'.
-void WebPFrameCacheDelete(WebPFrameCache* const cache);
-
-// Given an image described by 'frame', 'rect', 'dispose_method' and 'duration',
-// optimize it for WebP, encode it and add it to 'cache'. 'rect' can be NULL.
-// This takes care of frame disposal too, according to 'dispose_method'.
-// Returns false in case of error (and sets frame->error_code accordingly).
-int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
-                           const WebPConfig* const config,
-                           const WebPFrameRect* const rect,
-                           FrameDisposeMethod dispose_method, int duration,
-                           WebPPicture* const frame);
-
-// Flush the *ready* frames from cache and add them to 'mux'. If 'verbose' is
-// true, prints the information about these frames.
-WebPMuxError WebPFrameCacheFlush(WebPFrameCache* const cache, int verbose,
-                                 WebPMux* const mux);
-
-// Similar to 'WebPFrameCacheFlushFrames()', but flushes *all* the frames.
-WebPMuxError WebPFrameCacheFlushAll(WebPFrameCache* const cache, int verbose,
-                                    WebPMux* const mux);
-
-//------------------------------------------------------------------------------
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  // WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
--- a/examples/gifdec.c
+++ b/examples/gifdec.c
@ -0,0 +1,396 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// GIF decode.
+
+#include "./gifdec.h"
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_GIF
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/encode.h"
+#include "webp/mux_types.h"
+
+#define GIF_TRANSPARENT_COLOR 0x00ffffff
+#define GIF_WHITE_COLOR       0xffffffff
+#define GIF_TRANSPARENT_MASK  0x01
+#define GIF_DISPOSE_MASK      0x07
+#define GIF_DISPOSE_SHIFT     2
+
+// from utils/utils.h
+extern void WebPCopyPlane(const uint8_t* src, int src_stride,
+                          uint8_t* dst, int dst_stride,
+                          int width, int height);
+extern void WebPCopyPixels(const WebPPicture* const src,
+                           WebPPicture* const dst);
+
+// Resolves the GIF background color into 'bgcolor' as a 0xAARRGGBB value.
+//  - If 'transparent_index' is valid and equals 'bgcolor_index', the result
+//    is GIF_TRANSPARENT_COLOR.
+//  - If no usable color map is available, or 'bgcolor_index' lies outside of
+//    it, a warning is printed on stderr and GIF_WHITE_COLOR is used.
+//  - Otherwise the palette entry is returned, fully opaque.
+void GIFGetBackgroundColor(const ColorMapObject* const color_map,
+                           int bgcolor_index, int transparent_index,
+                           uint32_t* const bgcolor) {
+  if (transparent_index != GIF_INDEX_INVALID &&
+      bgcolor_index == transparent_index) {
+    *bgcolor = GIF_TRANSPARENT_COLOR;  // Special case.
+  } else if (color_map == NULL || color_map->Colors == NULL
+             || bgcolor_index < 0
+             || bgcolor_index >= color_map->ColorCount) {
+    // Negative indices are rejected too: they would read before the palette.
+    *bgcolor = GIF_WHITE_COLOR;
+    fprintf(stderr,
+            "GIF decode warning: invalid background color index. Assuming "
+            "white background.\n");
+  } else {
+    const GifColorType color = color_map->Colors[bgcolor_index];
+    // Use unsigned arithmetic: '0xff << 24' overflows a signed 32-bit int.
+    *bgcolor = (0xffu                 << 24)
+             | ((uint32_t)color.Red   << 16)
+             | ((uint32_t)color.Green <<  8)
+             | ((uint32_t)color.Blue  <<  0);
+  }
+}
+
+// Parses the graphics extension sub-block 'buf', where buf[0] holds the
+// sub-block size and must be 4.
+// On success, stores the frame duration (converted from 10 ms to 1 ms units)
+// in 'duration', the disposal method in 'dispose' and the transparent palette
+// index (or GIF_INDEX_INVALID when the transparency flag is not set) in
+// 'transparent_index', then returns 1.
+// Returns 0, leaving the outputs untouched, if the size byte is not 4.
+int GIFReadGraphicsExtension(const GifByteType* const buf, int* const duration,
+                             GIFDisposeMethod* const dispose,
+                             int* const transparent_index) {
+  int flags;
+  int dispose_raw;
+  int duration_raw;
+  // Validate the declared sub-block size before reading any payload byte.
+  if (buf[0] != 4) return 0;
+  flags = buf[1];
+  dispose_raw = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
+  duration_raw = buf[2] | (buf[3] << 8);  // In 10 ms units.
+  *duration = duration_raw * 10;  // Duration is in 1 ms units.
+  switch (dispose_raw) {
+    case 3:
+      *dispose = GIF_DISPOSE_RESTORE_PREVIOUS;
+      break;
+    case 2:
+      *dispose = GIF_DISPOSE_BACKGROUND;
+      break;
+    case 1:
+    case 0:
+    default:
+      // Unknown disposal codes are treated like "do not dispose".
+      *dispose = GIF_DISPOSE_NONE;
+      break;
+  }
+  *transparent_index =
+      (flags & GIF_TRANSPARENT_MASK) ? buf[4] : GIF_INDEX_INVALID;
+  return 1;
+}
+
+// Converts 'len' palette indices from 'src' into 0xAARRGGBB pixels in 'dst'.
+// The frame's local color map is used when present, the global one otherwise.
+// Indices equal to 'transparent_index', and indices that fall outside of the
+// color map, produce GIF_TRANSPARENT_COLOR; every other pixel is fully opaque.
+// 'dst' is left untouched when no usable color map is available.
+static void Remap(const GifFileType* const gif, const uint8_t* const src,
+                  int len, int transparent_index, uint32_t* dst) {
+  int i;
+  const GifColorType* colors;
+  const ColorMapObject* const cmap =
+      gif->Image.ColorMap ? gif->Image.ColorMap : gif->SColorMap;
+  if (cmap == NULL || cmap->Colors == NULL) return;
+  colors = cmap->Colors;
+
+  for (i = 0; i < len; ++i) {
+    const int index = src[i];
+    if (index == transparent_index || index >= cmap->ColorCount) {
+      // A corrupt stream may reference entries past the end of the palette:
+      // never read 'colors' out of bounds.
+      dst[i] = GIF_TRANSPARENT_COLOR;
+    } else {
+      const GifColorType c = colors[index];
+      // Unsigned arithmetic: '0xff << 24' overflows a signed 32-bit int.
+      dst[i] = (uint32_t)c.Blue | ((uint32_t)c.Green << 8) |
+               ((uint32_t)c.Red << 16) | (0xffu << 24);
+    }
+  }
+}
+
+// Decodes the image described by 'gif->Image' into the matching sub-rectangle
+// of 'picture', one line at a time, handling both interlaced and sequential
+// row orders. The frame rectangle is reported through 'gif_rect'.
+// Returns 1 on success, 0 if the rectangle is invalid for 'picture', if the
+// line buffer cannot be allocated or if a line cannot be read.
+int GIFReadFrame(GifFileType* const gif, int transparent_index,
+                 GIFFrameRect* const gif_rect, WebPPicture* const picture) {
+  // First row and row increment of each of the 4 interlacing passes.
+  static const int kPassStart[4] = { 0, 4, 2, 1 };
+  static const int kPassStep[4]  = { 8, 8, 4, 2 };
+  const GifImageDesc* const desc = &gif->Image;
+  const int left = desc->Left;
+  const int top = desc->Top;
+  const int width = desc->Width;
+  const int height = desc->Height;
+  const int interlaced = desc->Interlace;
+  // A sequential image is read as a single pass over every row.
+  const int num_passes = interlaced ? 4 : 1;
+  WebPPicture view;
+  uint32_t* pixels = NULL;
+  uint8_t* row = NULL;
+  int success = 0;
+  int pass;
+
+  gif_rect->x_offset = left;
+  gif_rect->y_offset = top;
+  gif_rect->width = width;
+  gif_rect->height = height;
+
+  // Use a view for the sub-picture:
+  if (!WebPPictureView(picture, left, top, width, height, &view)) {
+    fprintf(stderr, "Sub-image %dx%d at position %d,%d is invalid!\n",
+            width, height, left, top);
+    return 0;
+  }
+  pixels = view.argb;
+
+  row = (uint8_t*)malloc(width * sizeof(*row));
+  if (row == NULL) goto End;
+
+  for (pass = 0; pass < num_passes; ++pass) {
+    const int step = interlaced ? kPassStep[pass] : 1;
+    int y = interlaced ? kPassStart[pass] : 0;
+    while (y < height) {
+      if (DGifGetLine(gif, row, width) == GIF_ERROR) goto End;
+      Remap(gif, row, width, transparent_index,
+            pixels + y * view.argb_stride);
+      y += step;
+    }
+  }
+  success = 1;
+
+ End:
+  if (!success) picture->error_code = view.error_code;
+  WebPPictureFree(&view);
+  free(row);
+  return success;
+}
+
+// Fetches the sub-block following a NETSCAPE2.0 / ANIMEXTS1.0 application
+// extension header and extracts the 16-bit little-endian loop count from it.
+// On return '*buf' points to the sub-block that was fetched (possibly NULL).
+// Returns 1 on success; 0 on a read error, a missing sub-block, or a
+// sub-block with an unexpected size or marker.
+int GIFReadLoopCount(GifFileType* const gif, GifByteType** const buf,
+                     int* const loop_count) {
+  const GifByteType* block;
+  assert(!memcmp(*buf + 1, "NETSCAPE2.0", 11) ||
+         !memcmp(*buf + 1, "ANIMEXTS1.0", 11));
+  if (DGifGetExtensionNext(gif, buf) == GIF_ERROR) return 0;
+
+  block = *buf;
+  if (block == NULL) return 0;                  // Loop count sub-block missing.
+  if (block[0] < 3 || block[1] != 1) return 0;  // wrong size/marker
+
+  *loop_count = block[2] | (block[3] << 8);
+  return 1;
+}
+
+// Appends the payload of every remaining sub-block of the current XMP or ICC
+// application extension to 'metadata', growing its buffer with realloc().
+// Returns 1 once the terminating (NULL) sub-block is reached, 0 on a read or
+// allocation failure. On failure 'metadata' keeps whatever was gathered so
+// far, so the caller remains responsible for releasing it.
+int GIFReadMetadata(GifFileType* const gif, GifByteType** const buf,
+                    WebPData* const metadata) {
+  // XMP padding data is 0x01, 0xff, 0xfe ... 0x01, 0x00.
+  const size_t kXmpPaddingSize = 257;
+  const int is_xmp = !memcmp(*buf + 1, "XMP DataXMP", 11);
+  const int is_icc = !memcmp(*buf + 1, "ICCRGBG1012", 11);
+  assert(is_xmp || is_icc);
+  (void)is_icc;  // silence unused warning.
+
+  // Sub-block layout:
+  //  - usual case (including ICC): the first byte is the payload size
+  //    (0 to 255) and the remaining bytes are the payload.
+  //  - XMP: the size byte itself is part of the payload as well, and the
+  //    stream ends with 257 bytes of padding. See the XMP specification.
+  for (;;) {
+    const uint8_t* chunk;
+    size_t chunk_size;
+    uint8_t* grown;
+    if (DGifGetExtensionNext(gif, buf) == GIF_ERROR) return 0;
+    if (*buf == NULL) break;  // Finished.
+
+    if (is_xmp) {
+      chunk = *buf;
+      chunk_size = (*buf)[0] + 1;
+    } else {
+      chunk = *buf + 1;
+      chunk_size = (*buf)[0];
+    }
+    assert(chunk_size > 0);
+
+    // Keep the realloc() result in a temporary, so that the previous buffer
+    // is still referenced by 'metadata' if the allocation fails.
+    grown = (uint8_t*)realloc((void*)metadata->bytes,
+                              metadata->size + chunk_size);
+    if (grown == NULL) return 0;
+    memcpy(grown + metadata->size, chunk, chunk_size);
+    metadata->bytes = grown;
+    metadata->size += chunk_size;
+  }
+
+  // Drop the trailing XMP padding when there is more data than padding.
+  if (is_xmp && metadata->size > kXmpPaddingSize) {
+    metadata->size -= kXmpPaddingSize;
+  }
+  return 1;
+}
+
+// Fills the 'width' x 'height' area of 'picture' whose top-left corner is at
+// ('left', 'top') with GIF_TRANSPARENT_COLOR. No bounds checking is done.
+static void ClearRectangle(WebPPicture* const picture,
+                           int left, int top, int width, int height) {
+  const int right = left + width;
+  const int bottom = top + height;
+  int y;
+  for (y = top; y < bottom; ++y) {
+    uint32_t* const row = picture->argb + y * picture->argb_stride;
+    int x;
+    for (x = left; x < right; ++x) row[x] = GIF_TRANSPARENT_COLOR;
+  }
+}
+
+// Clears 'rect' inside 'pic' to the transparent color. A NULL 'rect' clears
+// the whole picture.
+void GIFClearPic(WebPPicture* const pic, const GIFFrameRect* const rect) {
+  if (rect == NULL) {
+    ClearRectangle(pic, 0, 0, pic->width, pic->height);
+    return;
+  }
+  ClearRectangle(pic, rect->x_offset, rect->y_offset,
+                 rect->width, rect->height);
+}
+
+// Copies the pixels of 'src' into 'dst' by forwarding to WebPCopyPixels(),
+// which is declared 'extern' near the top of this file.
+void GIFCopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
+  WebPCopyPixels(src, dst);
+}
+
+// Applies the disposal method 'dispose' to the area 'rect' of 'curr_canvas':
+//  - GIF_DISPOSE_BACKGROUND: the area is cleared to the transparent color.
+//  - GIF_DISPOSE_RESTORE_PREVIOUS: the area is overwritten with the matching
+//    pixels of 'prev_canvas'.
+//  - any other value: 'curr_canvas' is left untouched.
+// 'rect' must not be NULL; 'prev_canvas' must not be NULL when restoring.
+void GIFDisposeFrame(GIFDisposeMethod dispose, const GIFFrameRect* const rect,
+                     const WebPPicture* const prev_canvas,
+                     WebPPicture* const curr_canvas) {
+  assert(rect != NULL);
+  if (dispose == GIF_DISPOSE_BACKGROUND) {
+    GIFClearPic(curr_canvas, rect);
+  } else if (dispose == GIF_DISPOSE_RESTORE_PREVIOUS) {
+    int src_stride;
+    int dst_stride;
+    const uint32_t* src;
+    uint32_t* dst;
+    // Check the pointer before its first dereference, not after.
+    assert(prev_canvas != NULL);
+    src_stride = prev_canvas->argb_stride;
+    dst_stride = curr_canvas->argb_stride;
+    src = prev_canvas->argb + rect->x_offset + rect->y_offset * src_stride;
+    dst = curr_canvas->argb + rect->x_offset + rect->y_offset * dst_stride;
+    // Strides and width are expressed in bytes here: 4 bytes per ARGB pixel.
+    WebPCopyPlane((const uint8_t*)src, 4 * src_stride,
+                  (uint8_t*)dst, 4 * dst_stride,
+                  4 * rect->width, rect->height);
+  }
+}
+
+// Copies into 'dst' every pixel of 'src' that lies inside 'rect' and whose
+// alpha byte is non-zero. Pixels with a zero alpha leave 'dst' unchanged.
+// 'src' and 'dst' must have identical dimensions.
+void GIFBlendFrames(const WebPPicture* const src,
+                    const GIFFrameRect* const rect, WebPPicture* const dst) {
+  int y;
+  assert(src->width == dst->width && src->height == dst->height);
+  for (y = rect->y_offset; y < rect->y_offset + rect->height; ++y) {
+    const uint32_t* const src_row = src->argb + y * src->argb_stride;
+    uint32_t* const dst_row = dst->argb + y * dst->argb_stride;
+    int x;
+    for (x = rect->x_offset; x < rect->x_offset + rect->width; ++x) {
+      const uint32_t pixel = src_row[x];
+      if ((pixel >> 24) == 0) continue;  // Nothing to blend for this pixel.
+      dst_row[x] = pixel;
+    }
+  }
+}
+
+// Prints "GIFLib Error <gif_error>" followed by a description on stderr.
+// How the description is obtained depends on the giflib version detected
+// through LOCAL_GIF_PREREQ(); 'gif' is only consulted for giflib >= 5.0.
+void GIFDisplayError(const GifFileType* const gif, int gif_error) {
+  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
+#if LOCAL_GIF_PREREQ(4,2)
+#if LOCAL_GIF_PREREQ(5,0)
+  // giflib >= 5.0: GifErrorString() takes the error code as argument. The
+  // code stored in 'gif' is used when available, 'gif_error' otherwise.
+  // Static string actually, hence the const char* cast.
+  const char* error_str = (const char*)GifErrorString(
+      (gif == NULL) ? gif_error : gif->Error);
+#else
+  // giflib 4.2.x: GifErrorString() takes no argument.
+  const char* error_str = (const char*)GifErrorString();
+  (void)gif;
+#endif
+  if (error_str == NULL) error_str = "Unknown error";
+  fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
+#else
+  // Older giflib (or unknown version): PrintGifError() emits the message.
+  (void)gif;
+  fprintf(stderr, "GIFLib Error %d: ", gif_error);
+  PrintGifError();
+  fprintf(stderr, "\n");
+#endif
+}
+
+#else  // !WEBP_HAVE_GIF
+
+// Reports on stderr that this binary was built without GIF support.
+// Declared with '(void)': in C an empty parameter list does not form a
+// prototype and triggers strict-prototype warnings.
+static void ErrorGIFNotAvailable(void) {
+  fprintf(stderr, "GIF support not compiled. Please install the libgif-dev "
+          "package before building.\n");
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: ignores every argument
+// ('bgcolor' is not written) and reports that GIF support is missing.
+void GIFGetBackgroundColor(const struct ColorMapObject* const color_map,
+                           int bgcolor_index, int transparent_index,
+                           uint32_t* const bgcolor) {
+  (void)color_map;
+  (void)bgcolor_index;
+  (void)transparent_index;
+  (void)bgcolor;
+  ErrorGIFNotAvailable();
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: ignores every argument,
+// reports that GIF support is missing and returns 0 (failure).
+int GIFReadGraphicsExtension(const GifByteType* const data, int* const duration,
+                             GIFDisposeMethod* const dispose,
+                             int* const transparent_index) {
+  (void)data;
+  (void)duration;
+  (void)dispose;
+  (void)transparent_index;
+  ErrorGIFNotAvailable();
+  return 0;
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: ignores every argument,
+// reports that GIF support is missing and returns 0 (failure).
+int GIFReadFrame(struct GifFileType* const gif, int transparent_index,
+                 GIFFrameRect* const gif_rect,
+                 struct WebPPicture* const picture) {
+  (void)gif;
+  (void)transparent_index;
+  (void)gif_rect;
+  (void)picture;
+  ErrorGIFNotAvailable();
+  return 0;
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: ignores every argument,
+// reports that GIF support is missing and returns 0 (failure).
+int GIFReadLoopCount(struct GifFileType* const gif, GifByteType** const buf,
+                     int* const loop_count) {
+  (void)gif;
+  (void)buf;
+  (void)loop_count;
+  ErrorGIFNotAvailable();
+  return 0;
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: ignores every argument,
+// reports that GIF support is missing and returns 0 (failure).
+int GIFReadMetadata(struct GifFileType* const gif, GifByteType** const buf,
+                    struct WebPData* const metadata) {
+  (void)gif;
+  (void)buf;
+  (void)metadata;
+  ErrorGIFNotAvailable();
+  return 0;
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: leaves both canvases untouched
+// and reports that GIF support is missing.
+void GIFDisposeFrame(GIFDisposeMethod dispose, const GIFFrameRect* const rect,
+                     const struct WebPPicture* const prev_canvas,
+                     struct WebPPicture* const curr_canvas) {
+  (void)dispose;
+  (void)rect;
+  (void)prev_canvas;
+  (void)curr_canvas;
+  ErrorGIFNotAvailable();
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: leaves 'dst' untouched and
+// reports that GIF support is missing.
+void GIFBlendFrames(const struct WebPPicture* const src,
+                    const GIFFrameRect* const rect,
+                    struct WebPPicture* const dst) {
+  (void)src;
+  (void)rect;
+  (void)dst;
+  ErrorGIFNotAvailable();
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: ignores 'gif' and 'gif_error'
+// and only reports that GIF support is missing.
+void GIFDisplayError(const struct GifFileType* const gif, int gif_error) {
+  (void)gif;
+  (void)gif_error;
+  ErrorGIFNotAvailable();
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: leaves 'pic' untouched and
+// reports that GIF support is missing.
+void GIFClearPic(struct WebPPicture* const pic,
+                 const GIFFrameRect* const rect) {
+  (void)pic;
+  (void)rect;
+  ErrorGIFNotAvailable();
+}
+
+// Stub used when WEBP_HAVE_GIF is not defined: copies nothing and reports
+// that GIF support is missing.
+void GIFCopyPixels(const struct WebPPicture* const src,
+                   struct WebPPicture* const dst) {
+  (void)src;
+  (void)dst;
+  ErrorGIFNotAvailable();
+}
+
+#endif  // WEBP_HAVE_GIF
+
+// -----------------------------------------------------------------------------
--- a/examples/gifdec.h
+++ b/examples/gifdec.h
@ -0,0 +1,116 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// GIF decode.
+
+#ifndef WEBP_EXAMPLES_GIFDEC_H_
+#define WEBP_EXAMPLES_GIFDEC_H_
+
+#include <stdio.h>
+#include "webp/types.h"
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#ifdef WEBP_HAVE_GIF
+#include <gif_lib.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
+#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR)
+# define LOCAL_GIF_VERSION ((GIFLIB_MAJOR << 8) | GIFLIB_MINOR)
+# define LOCAL_GIF_PREREQ(maj, min) \
+    (LOCAL_GIF_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GIF_VERSION 0
+# define LOCAL_GIF_PREREQ(maj, min) 0
+#endif
+
+#define GIF_INDEX_INVALID (-1)
+
+// How the area covered by a frame is treated once the frame has been shown.
+// See GIFDisposeFrame() for the way each value is applied.
+typedef enum GIFDisposeMethod {
+  GIF_DISPOSE_NONE,             // Leave the canvas as it is.
+  GIF_DISPOSE_BACKGROUND,       // Clear the frame area to transparent.
+  GIF_DISPOSE_RESTORE_PREVIOUS  // Restore the frame area from the previous
+                                // canvas.
+} GIFDisposeMethod;
+
+// Position ('x_offset', 'y_offset') and dimensions ('width', 'height') of a
+// frame rectangle; offsets are used as column / row indices into the canvas.
+typedef struct {
+  int x_offset, y_offset, width, height;
+} GIFFrameRect;
+
+struct WebPData;
+struct WebPPicture;
+
+#ifndef WEBP_HAVE_GIF
+struct ColorMapObject;
+struct GifFileType;
+typedef unsigned char GifByteType;
+#endif
+
+// Given the index of background color and transparent color, returns the
+// corresponding background color (in BGRA format) in 'bgcolor'.
+void GIFGetBackgroundColor(const struct ColorMapObject* const color_map,
+                           int bgcolor_index, int transparent_index,
+                           uint32_t* const bgcolor);
+
+// Parses the given graphics extension data to get frame duration (in 1ms
+// units), dispose method and transparent color index.
+// Returns true on success.
+int GIFReadGraphicsExtension(const GifByteType* const buf, int* const duration,
+                             GIFDisposeMethod* const dispose,
+                             int* const transparent_index);
+
+// Reads the next GIF frame from 'gif' into 'picture'. Also, returns the GIF
+// frame dimensions and offsets in 'gif_rect'.
+// Returns true on success.
+int GIFReadFrame(struct GifFileType* const gif, int transparent_index,
+                 GIFFrameRect* const gif_rect,
+                 struct WebPPicture* const picture);
+
+// Parses loop count from the given Netscape extension data into 'loop_count'.
+// Returns true on success.
+int GIFReadLoopCount(struct GifFileType* const gif, GifByteType** const buf,
+                     int* const loop_count);
+
+// Parses the given ICC or XMP extension data and stores it into 'metadata'.
+// Returns true on success.
+int GIFReadMetadata(struct GifFileType* const gif, GifByteType** const buf,
+                    struct WebPData* const metadata);
+
+// Dispose the pixels within 'rect' of 'curr_canvas' based on 'dispose' method
+// and 'prev_canvas'.
+void GIFDisposeFrame(GIFDisposeMethod dispose, const GIFFrameRect* const rect,
+                     const struct WebPPicture* const prev_canvas,
+                     struct WebPPicture* const curr_canvas);
+
+// Given 'src' picture and its frame rectangle 'rect', blend it into 'dst'.
+void GIFBlendFrames(const struct WebPPicture* const src,
+                    const GIFFrameRect* const rect,
+                    struct WebPPicture* const dst);
+
+// Prints an error string based on 'gif_error'.
+void GIFDisplayError(const struct GifFileType* const gif, int gif_error);
+
+// In the given 'pic', clear the pixels in 'rect' to transparent color.
+void GIFClearPic(struct WebPPicture* const pic, const GIFFrameRect* const rect);
+
+// Copy pixels from 'src' to 'dst' honoring strides. 'src' and 'dst' are assumed
+// to be already allocated.
+void GIFCopyPixels(const struct WebPPicture* const src,
+                   struct WebPPicture* const dst);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_GIFDEC_H_
--- a/examples/image_dec.c
+++ b/examples/image_dec.c
@ -0,0 +1,46 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Generic image-type guessing.
+
+#include "./image_dec.h"
+
+static WEBP_INLINE uint32_t GetBE32(const uint8_t buf[]) {
+  return ((((((uint32_t)buf[0] << 8) | buf[1]) << 8) | buf[2]) << 8) | buf[3];
+}
+
+WebPInputFileFormat WebPGuessImageType(const uint8_t* const data,
+                                       size_t data_size) {
+  uint32_t head, form;
+  // Two big-endian words are inspected: bytes 0..3 and bytes 8..11, hence
+  // the 12-byte minimum.
+  if (data == NULL || data_size < 12) return WEBP_UNSUPPORTED_FORMAT;
+  head = GetBE32(data + 0);
+  form = GetBE32(data + 8);
+  // 0x89 'P' 'N' 'G'
+  if (head == 0x89504E47U) return WEBP_PNG_FORMAT;
+  // 0xFF 0xD8 0xFF, then any fourth byte.
+  if ((head & 0xFFFFFF00U) == 0xFFD8FF00U) return WEBP_JPEG_FORMAT;
+  // 'I' 'I' '*' 0x00  or  'M' 'M' 0x00 '*'
+  if (head == 0x49492A00U || head == 0x4D4D002AU) return WEBP_TIFF_FORMAT;
+  // 'R' 'I' 'F' 'F' at offset 0 and 'W' 'E' 'B' 'P' at offset 8.
+  if (head == 0x52494646U && form == 0x57454250U) return WEBP_WEBP_FORMAT;
+  return WEBP_UNSUPPORTED_FORMAT;
+}
+
+WebPImageReader WebPGuessImageReader(const uint8_t* const data,
+                                     size_t data_size) {
+  // Map the sniffed container type onto its decoding routine.
+  const WebPInputFileFormat type = WebPGuessImageType(data, data_size);
+  if (type == WEBP_PNG_FORMAT) return ReadPNG;
+  if (type == WEBP_JPEG_FORMAT) return ReadJPEG;
+  if (type == WEBP_TIFF_FORMAT) return ReadTIFF;
+  if (type == WEBP_WEBP_FORMAT) return ReadWebP;
+  return NULL;  // unrecognized input
+}
--- a/examples/image_dec.h
+++ b/examples/image_dec.h
@ -0,0 +1,61 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  All-in-one library to decode PNG/JPEG/WebP/TIFF/WIC input images.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_EXAMPLES_IMAGE_DEC_H_
+#define WEBP_EXAMPLES_IMAGE_DEC_H_
+
+#include "webp/types.h"
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#include "./metadata.h"
+#include "./jpegdec.h"
+#include "./pngdec.h"
+#include "./tiffdec.h"
+#include "./webpdec.h"
+#include "./wicdec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  WEBP_PNG_FORMAT = 0,
+  WEBP_JPEG_FORMAT,
+  WEBP_TIFF_FORMAT,
+  WEBP_WEBP_FORMAT,
+  WEBP_UNSUPPORTED_FORMAT
+} WebPInputFileFormat;
+
+// Try to infer the image format. 'data_size' should be at least 12.
+// Returns WEBP_UNSUPPORTED_FORMAT if format can't be guessed safely.
+WebPInputFileFormat WebPGuessImageType(const uint8_t* const data,
+                                       size_t data_size);
+
+// Signature for common image-reading functions (ReadPNG, ReadJPEG, ...)
+typedef int (*WebPImageReader)(const uint8_t* const data, size_t data_size,
+                               struct WebPPicture* const pic,
+                               int keep_alpha, struct Metadata* const metadata);
+
+// This function is similar to WebPGuessImageType(), but returns a
+// suitable reader function. Or NULL if the image can't be guessed.
+WebPImageReader WebPGuessImageReader(const uint8_t* const data,
+                                     size_t data_size);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_IMAGE_DEC_H_
--- a/examples/jpegdec.c
+++ b/examples/jpegdec.c
@ -19,11 +19,13 @@

 #ifdef WEBP_HAVE_JPEG
 #include <jpeglib.h>
+#include <jerror.h>
 #include <setjmp.h>
 #include <stdlib.h>
 #include <string.h>

 #include "webp/encode.h"
+#include "./example_util.h"
 #include "./metadata.h"

 // -----------------------------------------------------------------------------
@ -208,13 +210,65 @@ static void my_error_exit(j_common_ptr dinfo) {
  longjmp(myerr->setjmp_buffer, 1);
 }

-int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
-  int ok = 0;
+typedef struct {
+  struct jpeg_source_mgr pub;  // kept first: the struct is cast to cinfo->src
+  const uint8_t* data;         // start of the in-memory JPEG stream
+  size_t data_size;            // number of bytes available at 'data'
+} JPEGReadContext;
+
+static void ContextInit(j_decompress_ptr cinfo) {
+  const JPEGReadContext* const reader = (const JPEGReadContext*)cinfo->src;
+  cinfo->src->bytes_in_buffer = reader->data_size;  // whole buffer at once
+  cinfo->src->next_input_byte = reader->data;
+}
+
+static int ContextFill(j_decompress_ptr cinfo) {
+  // Not expected to run: ContextInit() already handed over every input byte.
+  ERREXIT(cinfo, JERR_FILE_READ);
+  return 0;
+}
+
+// Skips 'jump_size' bytes, clamped to what is left in the buffer. libjpeg
+// requires a zero or negative count to be a no-op, so never cast it blindly.
+static void ContextSkip(j_decompress_ptr cinfo, long jump_size) {
+  JPEGReadContext* const ctx = (JPEGReadContext*)cinfo->src;
+  size_t jump = (jump_size > 0) ? (size_t)jump_size : 0;
+  if (jump > ctx->pub.bytes_in_buffer) jump = ctx->pub.bytes_in_buffer;
+  ctx->pub.bytes_in_buffer -= jump;
+  ctx->pub.next_input_byte += jump;
+}
+
+static void ContextTerm(j_decompress_ptr cinfo) {
+  (void)cinfo;  // intentionally empty: this source manager frees nothing
+}
+
+static void ContextSetup(volatile struct jpeg_decompress_struct* const cinfo,
+                         JPEGReadContext* const ctx) {
+  ctx->pub.next_input_byte = NULL;  // nothing exposed until init_source runs
+  ctx->pub.bytes_in_buffer = 0;
+  ctx->pub.init_source = ContextInit;
+  ctx->pub.term_source = ContextTerm;
+  ctx->pub.fill_input_buffer = ContextFill;
+  ctx->pub.skip_input_data = ContextSkip;
+  ctx->pub.resync_to_restart = jpeg_resync_to_restart;
+  cinfo->src = &ctx->pub;
+}
+
+int ReadJPEG(const uint8_t* const data, size_t data_size,
+             WebPPicture* const pic, int keep_alpha,
+             Metadata* const metadata) {
+  volatile int ok = 0;
  int stride, width, height;
  volatile struct jpeg_decompress_struct dinfo;
  struct my_error_mgr jerr;
  uint8_t* volatile rgb = NULL;
  JSAMPROW buffer[1];
+  JPEGReadContext ctx;
+
+  (void)keep_alpha;
+  memset(&ctx, 0, sizeof(ctx));
+  ctx.data = data;
+  ctx.data_size = data_size;

  memset((j_decompress_ptr)&dinfo, 0, sizeof(dinfo));   // for setjmp sanity
  dinfo.err = jpeg_std_error(&jerr.pub);
@ -228,7 +282,7 @@ int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
  }

  jpeg_create_decompress((j_decompress_ptr)&dinfo);
-  jpeg_stdio_src((j_decompress_ptr)&dinfo, in_file);
+  ContextSetup(&dinfo, &ctx);
  if (metadata != NULL) SaveMetadataMarkers((j_decompress_ptr)&dinfo);
  jpeg_read_header((j_decompress_ptr)&dinfo, TRUE);

@ -272,7 +326,6 @@ int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
  // WebP conversion.
  pic->width = width;
  pic->height = height;
-  pic->use_argb = 1;      // store raw RGB samples
  ok = WebPPictureImportRGB(pic, rgb, stride);
  if (!ok) goto Error;

@ -281,10 +334,13 @@ int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
  return ok;
 }
 #else  // !WEBP_HAVE_JPEG
-int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
+int ReadJPEG(const uint8_t* const data, size_t data_size,
+             struct WebPPicture* const pic, int keep_alpha,
             struct Metadata* const metadata) {
-  (void)in_file;
+  (void)data;
+  (void)data_size;
  (void)pic;
+  (void)keep_alpha;
  (void)metadata;
  fprintf(stderr, "JPEG support not compiled. Please install the libjpeg "
          "development package before building.\n");
--- a/examples/jpegdec.h
+++ b/examples/jpegdec.h
@ -22,10 +22,13 @@ extern "C" {
 struct Metadata;
 struct WebPPicture;

-// Reads a JPEG from 'in_file', returning the decoded output in 'pic'.
-// The output is RGB.
+// Reads a JPEG from 'data', returning the decoded output in 'pic'.
+// The output is RGB or YUV depending on pic->use_argb value.
 // Returns true on success.
-int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
+// 'keep_alpha' has no effect, but is kept for coherence with other signatures
+// for image readers.
+int ReadJPEG(const uint8_t* const data, size_t data_size,
+             struct WebPPicture* const pic, int keep_alpha,
             struct Metadata* const metadata);

 #ifdef __cplusplus
--- a/examples/pngdec.c
+++ b/examples/pngdec.c
@ -24,6 +24,7 @@
 #include <string.h>

 #include "webp/encode.h"
+#include "./example_util.h"
 #include "./metadata.h"

 static void PNGAPI error_function(png_structp png, png_const_charp error) {
@ -131,8 +132,8 @@ static int ExtractMetadataFromPNG(png_structp png,
  for (p = 0; p < 2; ++p)  {
    png_infop const info = (p == 0) ? head_info : end_info;
    png_textp text = NULL;
-    const int num = png_get_text(png, info, &text, NULL);
-    int i;
+    const png_uint_32 num = png_get_text(png, info, &text, NULL);
+    png_uint_32 i;
    // Look for EXIF / XMP metadata.
    for (i = 0; i < num; ++i, ++text) {
      int j;
@ -188,24 +189,42 @@ static int ExtractMetadataFromPNG(png_structp png,
  return 1;
 }

-int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
-            Metadata* const metadata) {
-  volatile png_structp png;
+typedef struct {
+  const uint8_t* data;   // start of the in-memory PNG stream
+  size_t data_size;      // total number of bytes at 'data'
+  png_size_t offset;     // bytes already consumed by ReadFunc()
+} PNGReadContext;
+
+static void ReadFunc(png_structp png_ptr, png_bytep data, png_size_t length) {
+  PNGReadContext* const src = (PNGReadContext*)png_get_io_ptr(png_ptr);
+  if (length > src->data_size - src->offset) {  // request exceeds what is left
+    png_error(png_ptr, "ReadFunc: invalid read length (overflow)!");
+  }
+  memcpy(data, &src->data[src->offset], length);
+  src->offset += length;
+}
+
+int ReadPNG(const uint8_t* const data, size_t data_size,
+            struct WebPPicture* const pic,
+            int keep_alpha, struct Metadata* const metadata) {
+  volatile png_structp png = NULL;
  volatile png_infop info = NULL;
  volatile png_infop end_info = NULL;
+  PNGReadContext context = { NULL, 0, 0 };
  int color_type, bit_depth, interlaced;
  int has_alpha;
  int num_passes;
  int p;
-  int ok = 0;
+  volatile int ok = 0;
  png_uint_32 width, height, y;
-  int stride;
+  png_uint_32 stride;
  uint8_t* volatile rgb = NULL;

+  context.data = data;
+  context.data_size = data_size;
+
  png = png_create_read_struct(PNG_LIBPNG_VER_STRING, 0, 0, 0);
-  if (png == NULL) {
-    goto End;
-  }
+  if (png == NULL) goto End;

  png_set_error_fn(png, 0, error_function, NULL);
  if (setjmp(png_jmpbuf(png))) {
@ -219,7 +238,7 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
  end_info = png_create_info_struct(png);
  if (end_info == NULL) goto Error;

-  png_init_io(png, in_file);
+  png_set_read_fn(png, &context, ReadFunc);
  png_read_info(png, info);
  if (!png_get_IHDR(png, info,
                    &width, &height, &bit_depth, &color_type, &interlaced,
@ -268,11 +287,10 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
    goto Error;
  }

-  pic->width = width;
-  pic->height = height;
-  pic->use_argb = 1;
-  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
-                 : WebPPictureImportRGB(pic, rgb, stride);
+  pic->width = (int)width;
+  pic->height = (int)height;
+  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, (int)stride)
+                 : WebPPictureImportRGB(pic, rgb, (int)stride);

  if (!ok) {
    goto Error;
@ -287,9 +305,11 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
  return ok;
 }
 #else  // !WEBP_HAVE_PNG
-int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
-            struct Metadata* const metadata) {
-  (void)in_file;
+int ReadPNG(const uint8_t* const data, size_t data_size,
+            struct WebPPicture* const pic,
+            int keep_alpha, struct Metadata* const metadata) {
+  (void)data;
+  (void)data_size;
  (void)pic;
  (void)keep_alpha;
  (void)metadata;
--- a/examples/pngdec.h
+++ b/examples/pngdec.h
@ -12,7 +12,7 @@
 #ifndef WEBP_EXAMPLES_PNGDEC_H_
 #define WEBP_EXAMPLES_PNGDEC_H_

-#include <stdio.h>
+#include "webp/types.h"

 #ifdef __cplusplus
 extern "C" {
@ -21,12 +21,14 @@ extern "C" {
 struct Metadata;
 struct WebPPicture;

-// Reads a PNG from 'in_file', returning the decoded output in 'pic'.
+// Reads a PNG from 'data', returning the decoded output in 'pic'.
+// Output is RGBA or YUVA, depending on pic->use_argb value.
 // If 'keep_alpha' is true and the PNG has an alpha channel, the output is RGBA
-// otherwise it will be RGB.
+// or YUVA. Otherwise, alpha channel is dropped and output is RGB or YUV.
 // Returns true on success.
-int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
-            struct Metadata* const metadata);
+int ReadPNG(const uint8_t* const data, size_t data_size,
+            struct WebPPicture* const pic,
+            int keep_alpha, struct Metadata* const metadata);

 #ifdef __cplusplus
 }    // extern "C"
--- a/examples/tiffdec.c
+++ b/examples/tiffdec.c
@ -16,6 +16,7 @@
 #endif

 #include <stdio.h>
+#include <string.h>

 #ifdef WEBP_HAVE_TIFF
 #include <tiffio.h>
@ -63,17 +64,71 @@ static int ExtractMetadataFromTIFF(TIFF* const tif, Metadata* const metadata) {
  return 1;
 }

-int ReadTIFF(const char* const filename,
+// Ad-hoc structure to supply read-from-memory functionalities.
+typedef struct {
+  const uint8_t* data;  // input buffer handed to ReadTIFF()
+  toff_t size;          // buffer length, in bytes
+  toff_t pos;           // current read position within 'data'
+} MyData;
+
+static int MyClose(thandle_t opaque) {
+  (void)opaque;  // nothing is released here
+  return 0;
+}
+
+static toff_t MySize(thandle_t opaque) {
+  // Total length of the in-memory stream.
+  return ((const MyData*)opaque)->size;
+}
+
+static toff_t MySeek(thandle_t opaque, toff_t offset, int whence) {
+  MyData* const stream = (MyData*)opaque;
+  const toff_t base = (whence == SEEK_SET) ? 0
+                    : (whence == SEEK_CUR) ? stream->pos : stream->size;
+  const toff_t target = base + offset;
+  if (target > stream->size) return (toff_t)-1;
+  stream->pos = target;
+  return target;
+}
+
+static int MyMapFile(thandle_t opaque, void** base, toff_t* size) {
+  (void)opaque;
+  (void)base;
+  (void)size;
+  return 0;  // no mapping is set up: 'base' and 'size' are left untouched
+}
+static void MyUnmapFile(thandle_t opaque, void* base, toff_t size) {
+  (void)opaque;  // no-op: MyMapFile() never maps anything
+  (void)base;
+  (void)size;
+}
+
+// Read callback: copies up to 'size' bytes from the current position into
+// 'dst', advances the position and returns the (possibly reduced) count.
+static tsize_t MyRead(thandle_t opaque, void* dst, tsize_t size) {
+  MyData* const stream = (MyData*)opaque;
+  // Truncate the request so that it never goes past the end of the buffer.
+  if (stream->pos + size > stream->size) size = stream->size - stream->pos;
+  if (size <= 0) return size;
+  memcpy(dst, &stream->data[stream->pos], size);
+  stream->pos += size;
+  return size;
+}
+
+int ReadTIFF(const uint8_t* const data, size_t data_size,
             WebPPicture* const pic, int keep_alpha,
             Metadata* const metadata) {
-  TIFF* const tif = TIFFOpen(filename, "r");
+  MyData my_data = { data, (toff_t)data_size, 0 };
+  TIFF* const tif = TIFFClientOpen("Memory", "r", &my_data,
+                                   MyRead, MyRead, MySeek, MyClose,
+                                   MySize, MyMapFile, MyUnmapFile);
  uint32 width, height;
  uint32* raster;
  int ok = 0;
  tdir_t dircount;

  if (tif == NULL) {
-    fprintf(stderr, "Error! Cannot open TIFF file '%s'\n", filename);
+    fprintf(stderr, "Error! Cannot parse TIFF file\n");
    return 0;
  }

@ -87,7 +142,7 @@ int ReadTIFF(const char* const filename,
  if (!(TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width) &&
        TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height))) {
    fprintf(stderr, "Error! Cannot retrieve TIFF image dimensions.\n");
-    return 0;
+    goto End;
  }
  raster = (uint32*)_TIFFmalloc(width * height * sizeof(*raster));
  if (raster != NULL) {
@ -100,7 +155,6 @@ int ReadTIFF(const char* const filename,
 #ifdef WORDS_BIGENDIAN
      TIFFSwabArrayOfLong(raster, width * height);
 #endif
-      pic->use_argb = 1;
      ok = keep_alpha
         ? WebPPictureImportRGBA(pic, (const uint8_t*)raster, stride)
         : WebPPictureImportRGBX(pic, (const uint8_t*)raster, stride);
@ -120,15 +174,16 @@ int ReadTIFF(const char* const filename,
      }
    }
  }
-
+ End:
  TIFFClose(tif);
  return ok;
 }
 #else  // !WEBP_HAVE_TIFF
-int ReadTIFF(const char* const filename,
+int ReadTIFF(const uint8_t* const data, size_t data_size,
             struct WebPPicture* const pic, int keep_alpha,
             struct Metadata* const metadata) {
-  (void)filename;
+  (void)data;
+  (void)data_size;
  (void)pic;
  (void)keep_alpha;
  (void)metadata;
--- a/examples/tiffdec.h
+++ b/examples/tiffdec.h
@ -12,6 +12,8 @@
 #ifndef WEBP_EXAMPLES_TIFFDEC_H_
 #define WEBP_EXAMPLES_TIFFDEC_H_

+#include "webp/types.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -19,11 +21,12 @@ extern "C" {
 struct Metadata;
 struct WebPPicture;

-// Reads a TIFF from 'filename', returning the decoded output in 'pic'.
+// Reads a TIFF from 'data', returning the decoded output in 'pic'.
+// Output is RGBA or YUVA, depending on pic->use_argb value.
 // If 'keep_alpha' is true and the TIFF has an alpha channel, the output is RGBA
-// otherwise it will be RGB.
+// or YUVA. Otherwise, alpha channel is dropped and output is RGB or YUV.
 // Returns true on success.
-int ReadTIFF(const char* const filename,
+int ReadTIFF(const uint8_t* const data, size_t data_size,
             struct WebPPicture* const pic, int keep_alpha,
             struct Metadata* const metadata);

--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -38,7 +38,7 @@

 #include "./example_util.h"

-#ifdef _MSC_VER
+#if defined(_MSC_VER) && _MSC_VER < 1900
 #define snprintf _snprintf
 #endif

@ -308,19 +308,24 @@ static void HandleDisplay(void) {
    //              they will be incorrect if the window is resized.
    // glScissor() takes window coordinates (0,0 at bottom left).
    int window_x, window_y;
+    int frame_w, frame_h;
    if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
      // Clear the previous frame rectangle.
      window_x = prev->x_offset;
      window_y = kParams.canvas_height - prev->y_offset - prev->height;
+      frame_w = prev->width;
+      frame_h = prev->height;
    } else {  // curr->blend_method == WEBP_MUX_NO_BLEND.
      // We simulate no-blending behavior by first clearing the current frame
      // rectangle (to a checker-board) and then alpha-blending against it.
      window_x = curr->x_offset;
      window_y = kParams.canvas_height - curr->y_offset - curr->height;
+      frame_w = curr->width;
+      frame_h = curr->height;
    }
    glEnable(GL_SCISSOR_TEST);
    // Only update the requested area, not the whole canvas.
-    glScissor(window_x, window_y, prev->width, prev->height);
+    glScissor(window_x, window_y, frame_w, frame_h);

    glClear(GL_COLOR_BUFFER_BIT);  // use clear color
    DrawCheckerBoard();
@ -382,17 +387,15 @@ static void Help(void) {
  printf("Usage: vwebp in_file [options]\n\n"
         "Decodes the WebP image file and visualize it using OpenGL\n"
         "Options are:\n"
-         "  -version  .... print version number and exit\n"
+         "  -version ..... print version number and exit\n"
         "  -noicc ....... don't use the icc profile if present\n"
         "  -nofancy ..... don't use the fancy YUV420 upscaler\n"
         "  -nofilter .... disable in-loop filtering\n"
         "  -dither <int>  dithering strength (0..100), default=50\n"
-#if WEBP_DECODER_ABI_VERSION > 0x0204
         "  -noalphadither disable alpha plane dithering\n"
-#endif
         "  -mt .......... use multi-threading\n"
         "  -info ........ print info\n"
-         "  -h     ....... this help message\n"
+         "  -h ........... this help message\n"
         "\n"
         "Keyboard shortcuts:\n"
         "  'c' ................ toggle use of color profile\n"
@ -411,9 +414,7 @@ int main(int argc, char *argv[]) {
    return -1;
  }
  config->options.dithering_strength = 50;
-#if WEBP_DECODER_ABI_VERSION > 0x0204
  config->options.alpha_dithering_strength = 100;
-#endif
  kParams.use_color_profile = 1;

  for (c = 1; c < argc; ++c) {
@ -427,10 +428,8 @@ int main(int argc, char *argv[]) {
      config->options.no_fancy_upsampling = 1;
    } else if (!strcmp(argv[c], "-nofilter")) {
      config->options.bypass_filtering = 1;
-#if WEBP_DECODER_ABI_VERSION > 0x0204
    } else if (!strcmp(argv[c], "-noalphadither")) {
      config->options.alpha_dithering_strength = 0;
-#endif
    } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) {
      config->options.dithering_strength =
          ExUtilGetInt(argv[++c], 0, &parse_error);
@ -527,6 +526,12 @@ int main(int argc, char *argv[]) {
  WebPDemuxGetFrame(kParams.dmux, 0, curr);
  if (kParams.loop_count) ++kParams.loop_count;

+#if defined(__unix__) || defined(__CYGWIN__)
+  // Work around GLUT compositor bug.
+  // https://bugs.launchpad.net/ubuntu/+source/freeglut/+bug/369891
+  setenv("XLIB_SKIP_ARGB_VISUALS", "1", 1);
+#endif
+
  // Start display (and timer)
  glutInit(&argc, argv);
 #ifdef FREEGLUT
--- a/examples/webpdec.c
+++ b/examples/webpdec.c
@ -19,11 +19,10 @@
 #include "./example_util.h"
 #include "./metadata.h"

-int ReadWebP(const char* const in_file, WebPPicture* const pic,
+int ReadWebP(const uint8_t* const data, size_t data_size,
+             WebPPicture* const pic,
             int keep_alpha, Metadata* const metadata) {
  int ok = 0;
-  size_t data_size = 0;
-  const uint8_t* data = NULL;
  VP8StatusCode status = VP8_STATUS_OK;
  WebPDecoderConfig config;
  WebPDecBuffer* const output_buffer = &config.output;
@ -39,27 +38,56 @@ int ReadWebP(const char* const in_file, WebPPicture* const pic,
    return 0;
  }

-  if (ExUtilLoadWebP(in_file, &data, &data_size, bitstream)) {
+  status = WebPGetFeatures(data, data_size, bitstream);
+  if (status != VP8_STATUS_OK) {
+    ExUtilPrintWebPError("input data", status);
+    return 0;
+  }
+  {
    const int has_alpha = keep_alpha && bitstream->has_alpha;
-    output_buffer->colorspace = has_alpha ? MODE_RGBA : MODE_RGB;
+    if (pic->use_argb) {
+      output_buffer->colorspace = has_alpha ? MODE_RGBA : MODE_RGB;
+    } else {
+      output_buffer->colorspace = has_alpha ? MODE_YUVA : MODE_YUV;
+    }

    status = ExUtilDecodeWebP(data, data_size, 0, &config);
    if (status == VP8_STATUS_OK) {
-      const uint8_t* const rgba = output_buffer->u.RGBA.rgba;
-      const int stride = output_buffer->u.RGBA.stride;
      pic->width = output_buffer->width;
      pic->height = output_buffer->height;
-      pic->use_argb = 1;
-      ok = has_alpha ? WebPPictureImportRGBA(pic, rgba, stride)
-                     : WebPPictureImportRGB(pic, rgba, stride);
+      if (pic->use_argb) {
+        const uint8_t* const rgba = output_buffer->u.RGBA.rgba;
+        const int stride = output_buffer->u.RGBA.stride;
+        ok = has_alpha ? WebPPictureImportRGBA(pic, rgba, stride)
+                       : WebPPictureImportRGB(pic, rgba, stride);
+      } else {
+        pic->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
+        ok = WebPPictureAlloc(pic);
+        if (!ok) {
+          status = VP8_STATUS_OUT_OF_MEMORY;
+        } else {
+          const WebPYUVABuffer* const yuva = &output_buffer->u.YUVA;
+          const int uv_width = (pic->width + 1) >> 1;
+          const int uv_height = (pic->height + 1) >> 1;
+          ExUtilCopyPlane(yuva->y, yuva->y_stride,
+                          pic->y, pic->y_stride, pic->width, pic->height);
+          ExUtilCopyPlane(yuva->u, yuva->u_stride,
+                          pic->u, pic->uv_stride, uv_width, uv_height);
+          ExUtilCopyPlane(yuva->v, yuva->v_stride,
+                          pic->v, pic->uv_stride, uv_width, uv_height);
+          if (has_alpha) {
+            ExUtilCopyPlane(yuva->a, yuva->a_stride,
+                            pic->a, pic->a_stride, pic->width, pic->height);
+          }
+        }
+      }
    }
  }

  if (status != VP8_STATUS_OK) {
-    ExUtilPrintWebPError(in_file, status);
+    ExUtilPrintWebPError("input data", status);
  }

-  free((void*)data);
  WebPFreeDecBuffer(output_buffer);
  return ok;
 }
--- a/examples/webpdec.h
+++ b/examples/webpdec.h
@ -12,6 +12,8 @@
 #ifndef WEBP_EXAMPLES_WEBPDEC_H_
 #define WEBP_EXAMPLES_WEBPDEC_H_

+#include "webp/types.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -20,10 +22,12 @@ struct Metadata;
 struct WebPPicture;

 // Reads a WebP from 'in_file', returning the decoded output in 'pic'.
-// If 'keep_alpha' is true and the WebP has an alpha channel, the output is
-// RGBA otherwise it will be RGB.
+// Output is RGBA or YUVA, depending on pic->use_argb value.
+// If 'keep_alpha' is true and the WebP has an alpha channel, the output is RGBA
+// or YUVA. Otherwise, alpha channel is dropped and output is RGB or YUV.
 // Returns true on success.
-int ReadWebP(const char* const in_file, struct WebPPicture* const pic,
+int ReadWebP(const uint8_t* const data, size_t data_size,
+             struct WebPPicture* const pic,
             int keep_alpha, struct Metadata* const metadata);

 #ifdef __cplusplus
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
@ -182,13 +182,11 @@ static WebPMuxError DisplayInfo(const WebPMux* mux) {
  printf("Canvas size: %d x %d\n", width, height);

  err = WebPMuxGetFeatures(mux, &flag);
-#ifndef WEBP_EXPERIMENTAL_FEATURES
  if (flag & FRAGMENTS_FLAG) err = WEBP_MUX_INVALID_ARGUMENT;
-#endif
  RETURN_IF_ERROR("Failed to retrieve features\n");

  if (flag == 0) {
-    fprintf(stderr, "No features present.\n");
+    printf("No features present.\n");
    return err;
  }

@ -291,9 +289,6 @@ static void PrintHelp(void) {
  printf("Usage: webpmux -get GET_OPTIONS INPUT -o OUTPUT\n");
  printf("       webpmux -set SET_OPTIONS INPUT -o OUTPUT\n");
  printf("       webpmux -strip STRIP_OPTIONS INPUT -o OUTPUT\n");
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  printf("       webpmux -frgm FRAGMENT_OPTIONS [-frgm...] -o OUTPUT\n");
-#endif
  printf("       webpmux -frame FRAME_OPTIONS [-frame...] [-loop LOOP_COUNT]"
         "\n");
  printf("               [-bgcolor BACKGROUND_COLOR] -o OUTPUT\n");
@ -307,9 +302,6 @@ static void PrintHelp(void) {
  printf("   icc       get ICC profile\n");
  printf("   exif      get EXIF metadata\n");
  printf("   xmp       get XMP metadata\n");
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  printf("   frgm n    get nth fragment\n");
-#endif
  printf("   frame n   get nth frame\n");

  printf("\n");
@ -329,16 +321,6 @@ static void PrintHelp(void) {
  printf("   exif      strip EXIF metadata\n");
  printf("   xmp       strip XMP metadata\n");

-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  printf("\n");
-  printf("FRAGMENT_OPTIONS(i):\n");
-  printf(" Create fragmented image:\n");
-  printf("   file_i +xi+yi\n");
-  printf("   where:    'file_i' is the i'th fragment (WebP format),\n");
-  printf("             'xi','yi' specify the image offset for this fragment"
-         "\n");
-#endif
-
  printf("\n");
  printf("FRAME_OPTIONS(i):\n");
  printf(" Create animation:\n");
@ -650,24 +632,6 @@ static int ParseCommandLine(int argc, const char* argv[],
        arg->params_ = argv[i + 1];
        ++feature_arg_index;
        i += 2;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      } else if (!strcmp(argv[i], "-frgm")) {
-        CHECK_NUM_ARGS_LESS(3, ErrParse);
-        if (ACTION_IS_NIL || config->action_type_ == ACTION_SET) {
-          config->action_type_ = ACTION_SET;
-        } else {
-          ERROR_GOTO1("ERROR: Multiple actions specified.\n", ErrParse);
-        }
-        if (FEATURETYPE_IS_NIL || feature->type_ == FEATURE_FRGM) {
-          feature->type_ = FEATURE_FRGM;
-        } else {
-          ERROR_GOTO1("ERROR: Multiple features specified.\n", ErrParse);
-        }
-        arg->filename_ = argv[i + 1];
-        arg->params_ = argv[i + 2];
-        ++feature_arg_index;
-        i += 3;
-#endif
      } else if (!strcmp(argv[i], "-o")) {
        CHECK_NUM_ARGS_LESS(2, ErrParse);
        config->output_ = argv[i + 1];
@ -727,13 +691,8 @@ static int ParseCommandLine(int argc, const char* argv[],
        } else {
          ++i;
        }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      } else if ((!strcmp(argv[i], "frame") ||
-                  !strcmp(argv[i], "frgm")) &&
-#else
      } else if (!strcmp(argv[i], "frame") &&
-#endif
-                  (config->action_type_ == ACTION_GET)) {
+                 (config->action_type_ == ACTION_GET)) {
        CHECK_NUM_ARGS_LESS(2, ErrParse);
        feature->type_ = (!strcmp(argv[i], "frame")) ? FEATURE_ANMF :
            FEATURE_FRGM;
@ -831,7 +790,7 @@ static int GetFrameFragment(const WebPMux* mux,
                            const WebPMuxConfig* config, int is_frame) {
  WebPMuxError err = WEBP_MUX_OK;
  WebPMux* mux_single = NULL;
-  long num = 0;
+  int num = 0;
  int ok = 1;
  int parse_error = 0;
  const WebPChunkId id = is_frame ? WEBP_CHUNK_ANMF : WEBP_CHUNK_FRGM;
@ -847,7 +806,7 @@ static int GetFrameFragment(const WebPMux* mux,
  err = WebPMuxGetFrame(mux, num, &info);
  if (err == WEBP_MUX_OK && info.id != id) err = WEBP_MUX_NOT_FOUND;
  if (err != WEBP_MUX_OK) {
-    ERROR_GOTO3("ERROR (%s): Could not get frame %ld.\n",
+    ERROR_GOTO3("ERROR (%s): Could not get frame %d.\n",
                ErrorString(err), num, ErrGet);
  }

--- a/examples/wicdec.c
+++ b/examples/wicdec.c
@ -17,6 +17,7 @@

 #include <assert.h>
 #include <stdio.h>
+#include <string.h>

 #ifdef HAVE_WINCODEC_H
 #ifdef __MINGW32__
@ -26,11 +27,13 @@
 #define COBJMACROS
 #define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
                         // code with COBJMACROS.
+#include <ole2.h>  // CreateStreamOnHGlobal()
 #include <shlwapi.h>
 #include <windows.h>
 #include <wincodec.h>

 #include "webp/encode.h"
+#include "./example_util.h"
 #include "./metadata.h"

 #define IFS(fn)                                                     \
@ -73,10 +76,41 @@ WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppBGRA_,
 WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppRGBA_,
                 0xf5c7ad2d, 0x6a8d, 0x43dd,
                 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat64bppBGRA_,
+                 0x1562ff7c, 0xd352, 0x46f9,
+                 0x97, 0x9e, 0x42, 0x97, 0x6b, 0x79, 0x22, 0x46);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat64bppRGBA_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x16);

 static HRESULT OpenInputStream(const char* filename, IStream** stream) {
  HRESULT hr = S_OK;
-  IFS(SHCreateStreamOnFileA(filename, STGM_READ, stream));
+  if (!strcmp(filename, "-")) {
+    const uint8_t* data = NULL;
+    size_t data_size = 0;
+    const int ok = ExUtilReadFile(filename, &data, &data_size);
+    if (ok) {
+      HGLOBAL image = GlobalAlloc(GMEM_MOVEABLE, data_size);
+      if (image != NULL) {
+        void* const image_mem = GlobalLock(image);
+        if (image_mem != NULL) {
+          memcpy(image_mem, data, data_size);
+          GlobalUnlock(image);
+          IFS(CreateStreamOnHGlobal(image, TRUE, stream));
+        } else {
+          hr = E_FAIL;
+        }
+      } else {
+        hr = E_OUTOFMEMORY;
+      }
+      free((void*)data);
+    } else {
+      hr = E_FAIL;
+    }
+  } else {
+    IFS(SHCreateStreamOnFileA(filename, STGM_READ, stream));
+  }
+
  if (FAILED(hr)) {
    fprintf(stderr, "Error opening input file %s (%08lx)\n", filename, hr);
  }
@ -196,7 +230,11 @@ static int HasAlpha(IWICImagingFactory* const factory,
    has_alpha = IsEqualGUID(MAKE_REFGUID(pixel_format),
                            MAKE_REFGUID(GUID_WICPixelFormat32bppRGBA_)) ||
                IsEqualGUID(MAKE_REFGUID(pixel_format),
-                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_));
+                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat64bppRGBA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat64bppBGRA_));
  }
  return has_alpha;
 }
@ -310,7 +348,7 @@ int ReadPictureWithWIC(const char* const filename,
    int ok;
    pic->width = width;
    pic->height = height;
-    pic->use_argb = 1;
+    pic->use_argb = 1;    // For WIC, we always force to argb
    ok = importer->import(pic, rgb, stride);
    if (!ok) hr = E_FAIL;
  }
--- a/examples/wicdec.h
+++ b/examples/wicdec.h
@ -21,7 +21,7 @@ struct WebPPicture;

 // Reads an image from 'filename', returning the decoded output in 'pic'.
 // If 'keep_alpha' is true and the image has an alpha channel, the output is
-// RGBA otherwise it will be RGB.
+// RGBA otherwise it will be RGB. pic->use_argb is always forced to true.
 // Returns true on success.
 int ReadPictureWithWIC(const char* const filename,
                       struct WebPPicture* const pic, int keep_alpha,
--- a/gradle.properties
+++ b/gradle.properties
@ -0,0 +1,14 @@
+# Project-wide Gradle settings.
+
+# IDE (e.g. Android Studio) users:
+# Gradle settings configured through the IDE *will override*
+# any settings specified in this file.
+
+# For more details on how to configure your build environment visit
+# http://www.gradle.org/docs/current/userguide/build_environment.html
+
+# Versions for gradle
+BUILD_TOOLS_VERSION=23.0.3
+COMPILE_SDK_VERSION=23
+ANDROID_GRADLE_PLUGIN_VERSION=1.5.0
+GRADLE_DOWNLOAD_TASK_VERSION=2.1.0
--- a/gradle/wrapper/gradle-wrapper.jar
+++ b/gradle/wrapper/gradle-wrapper.jar
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@ -0,0 +1,6 @@
+#Thu May 12 17:06:25 CEST 2016
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-2.13-bin.zip
--- a/164
+++ b/164
@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn ( ) {
+    echo "$*"
+}
+
+die ( ) {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+nonstop=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+  NONSTOP* )
+    nonstop=true
+    ;;
+esac
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=$((i+1))
+    done
+    case $i in
+        (0) set -- ;;
+        (1) set -- "$args0" ;;
+        (2) set -- "$args0" "$args1" ;;
+        (3) set -- "$args0" "$args1" "$args2" ;;
+        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
+function splitJvmOpts() {
+    JVM_OPTS=("$@")
+}
+eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
+JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+
+exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
--- a/gradlew.bat
+++ b/gradlew.bat
@ -0,0 +1,90 @@
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windows variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+if "%@eval[2+2]" == "4" goto 4NT_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+goto execute
+
+:4NT_args
+@rem Get arguments from the 4NT Shell from JP Software
+set CMD_LINE_ARGS=%$
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -41,6 +41,8 @@ LIBLIST=''
 if [[ -z "${SDK}" ]]; then
  echo "iOS SDK not available"
  exit 1
+elif [[ ${SDK%%.*} -gt 8 ]]; then
+  EXTRA_CFLAGS="-fembed-bitcode"
 elif [[ ${SDK} < 6.0 ]]; then
  echo "You need iOS SDK version 6.0 or above"
  exit 1
@ -94,7 +96,7 @@ for PLATFORM in ${PLATFORMS}; do
  SDKROOT="${PLATFORMSROOT}/"
  SDKROOT+="${PLATFORM}.platform/Developer/SDKs/${PLATFORM}${SDK}.sdk/"
  CFLAGS="-arch ${ARCH2:-${ARCH}} -pipe -isysroot ${SDKROOT} -O3 -DNDEBUG"
-  CFLAGS+=" -miphoneos-version-min=6.0"
+  CFLAGS+=" -miphoneos-version-min=6.0 ${EXTRA_CFLAGS}"

  set -x
  export PATH="${DEVROOT}/usr/bin:${OLDPATH}"
--- a/makefile.unix
+++ b/makefile.unix
@ -2,8 +2,8 @@
 # system, for simple local building of the libraries and tools.
 # It will not install the libraries system-wide, but just create the 'cwebp'
 # and 'dwebp' tools in the examples/ directory, along with the static
-# libraries 'src/libwebp.a', 'src/libwebpdecoder.a', 'src/mux/libwebpmux.a' and
-# 'src/demux/libwebpdemux.a'.
+# libraries 'src/libwebp.a', 'src/libwebpdecoder.a', 'src/mux/libwebpmux.a',
+# 'src/demux/libwebpdemux.a' and 'src/libwebpextras.a'.
 #
 # To build the library and examples, use:
 #    make -f makefile.unix
@ -61,6 +61,9 @@ endif
 EXTRA_FLAGS += -DWEBP_USE_THREAD
 EXTRA_LIBS += -lpthread

+# Control symbol visibility. Comment out if your compiler doesn't support it.
+EXTRA_FLAGS += -fvisibility=hidden
+
 # Extra flags to emulate C89 strictness with the full ANSI
 EXTRA_FLAGS += -Wextra -Wold-style-definition
 EXTRA_FLAGS += -Wmissing-prototypes
@ -68,9 +71,14 @@ EXTRA_FLAGS += -Wmissing-declarations
 EXTRA_FLAGS += -Wdeclaration-after-statement
 EXTRA_FLAGS += -Wshadow
 EXTRA_FLAGS += -Wformat-security -Wformat-nonliteral
-
 # EXTRA_FLAGS += -Wvla

+# SSE4.1-specific flags:
+ifeq ($(HAVE_SSE41), 1)
+EXTRA_FLAGS += -DWEBP_HAVE_SSE41
+src/dsp/%_sse41.o: EXTRA_FLAGS += -msse4.1
+endif
+
 # AVX2-specific flags:
 ifeq ($(HAVE_AVX2), 1)
 EXTRA_FLAGS += -DWEBP_HAVE_AVX2
@ -81,18 +89,29 @@ endif
 # EXTRA_FLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -mtune=cortex-a8
 # -> seems to make the overall lib slower: -fno-split-wide-types

+# MIPS (MSA) 32-bit build specific flags for mips32r5 (p5600):
+# EXTRA_FLAGS += -mips32r5 -mabi=32 -mtune=p5600 -mmsa -mfp64
+# EXTRA_FLAGS += -msched-weight -mload-store-pairs
+
+# MIPS (MSA) 64-bit build specific flags for mips64r6 (i6400):
+# EXTRA_FLAGS += -mips64r6 -mabi=64 -mtune=i6400 -mmsa -mfp64
+# EXTRA_FLAGS += -msched-weight -mload-store-pairs
+
 #### Nothing should normally be changed below this line ####

 AR = ar
 ARFLAGS = r
-CC = gcc
 CPPFLAGS = -Isrc/ -Wall
 CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
+CC = gcc
 INSTALL = install
 GROFF = /usr/bin/groff
 COL = /usr/bin/col
 LDFLAGS = $(EXTRA_LIBS) $(EXTRA_FLAGS) -lm

+ANIM_UTIL_OBJS = \
+    examples/anim_util.o \
+
 DEC_OBJS = \
    src/dec/alpha.o \
    src/dec/buffer.o \
@ -106,34 +125,65 @@ DEC_OBJS = \
    src/dec/webp.o \

 DEMUX_OBJS = \
+    src/demux/anim_decode.o \
    src/demux/demux.o \

 DSP_DEC_OBJS = \
    src/dsp/alpha_processing.o \
+    src/dsp/alpha_processing_mips_dsp_r2.o \
    src/dsp/alpha_processing_sse2.o \
+    src/dsp/alpha_processing_sse41.o \
    src/dsp/cpu.o \
    src/dsp/dec.o \
    src/dsp/dec_clip_tables.o \
    src/dsp/dec_mips32.o \
+    src/dsp/dec_mips_dsp_r2.o \
+    src/dsp/dec_msa.o \
    src/dsp/dec_neon.o \
    src/dsp/dec_sse2.o \
+    src/dsp/dec_sse41.o \
+    src/dsp/filters.o \
+    src/dsp/filters_mips_dsp_r2.o \
+    src/dsp/filters_sse2.o \
    src/dsp/lossless.o \
-    src/dsp/lossless_mips32.o \
+    src/dsp/lossless_mips_dsp_r2.o \
    src/dsp/lossless_neon.o \
    src/dsp/lossless_sse2.o \
+    src/dsp/rescaler.o \
+    src/dsp/rescaler_mips32.o \
+    src/dsp/rescaler_mips_dsp_r2.o \
+    src/dsp/rescaler_neon.o \
+    src/dsp/rescaler_sse2.o \
    src/dsp/upsampling.o \
+    src/dsp/upsampling_mips_dsp_r2.o \
    src/dsp/upsampling_neon.o \
    src/dsp/upsampling_sse2.o \
    src/dsp/yuv.o \
    src/dsp/yuv_mips32.o \
+    src/dsp/yuv_mips_dsp_r2.o \
    src/dsp/yuv_sse2.o \

 DSP_ENC_OBJS = \
+    src/dsp/argb.o \
+    src/dsp/argb_mips_dsp_r2.o \
+    src/dsp/argb_sse2.o \
+    src/dsp/cost.o \
+    src/dsp/cost_mips32.o \
+    src/dsp/cost_mips_dsp_r2.o \
+    src/dsp/cost_sse2.o \
    src/dsp/enc.o \
    src/dsp/enc_avx2.o \
    src/dsp/enc_mips32.o \
+    src/dsp/enc_mips_dsp_r2.o \
    src/dsp/enc_neon.o \
    src/dsp/enc_sse2.o \
+    src/dsp/enc_sse41.o \
+    src/dsp/lossless_enc.o \
+    src/dsp/lossless_enc_mips32.o \
+    src/dsp/lossless_enc_mips_dsp_r2.o \
+    src/dsp/lossless_enc_neon.o \
+    src/dsp/lossless_enc_sse2.o \
+    src/dsp/lossless_enc_sse41.o \

 ENC_OBJS = \
    src/enc/alpha.o \
@ -141,10 +191,12 @@ ENC_OBJS = \
    src/enc/backward_references.o \
    src/enc/config.o \
    src/enc/cost.o \
+    src/enc/delta_palettization.o \
    src/enc/filter.o \
    src/enc/frame.o \
    src/enc/histogram.o \
    src/enc/iterator.o \
+    src/enc/near_lossless.o \
    src/enc/picture.o \
    src/enc/picture_csp.o \
    src/enc/picture_psnr.o \
@ -158,6 +210,7 @@ ENC_OBJS = \
    src/enc/webpenc.o \

 EX_FORMAT_DEC_OBJS = \
+    examples/image_dec.o \
    examples/jpegdec.o \
    examples/metadata.o \
    examples/pngdec.o \
@ -167,10 +220,11 @@ EX_FORMAT_DEC_OBJS = \
 EX_UTIL_OBJS = \
    examples/example_util.o \

-GIF2WEBP_UTIL_OBJS = \
-    examples/gif2webp_util.o \
+GIFDEC_OBJS = \
+    examples/gifdec.o \

 MUX_OBJS = \
+    src/mux/anim_encode.o \
    src/mux/muxedit.o \
    src/mux/muxinternal.o \
    src/mux/muxread.o \
@ -191,11 +245,15 @@ UTILS_ENC_OBJS = \
    src/utils/huffman_encode.o \
    src/utils/quant_levels.o \

+EXTRA_OBJS = \
+    src/extras/extras.o \
+
 LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
 LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
               $(UTILS_ENC_OBJS)
 LIBWEBPMUX_OBJS = $(MUX_OBJS)
 LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS)
+LIBWEBPEXTRA_OBJS = $(EXTRA_OBJS)

 HDRS_INSTALLED = \
    src/webp/decode.h \
@ -207,24 +265,30 @@ HDRS_INSTALLED = \

 HDRS = \
    src/dec/alphai.h \
+    src/dec/common.h \
    src/dec/decode_vp8.h \
    src/dec/vp8i.h \
    src/dec/vp8li.h \
    src/dec/webpi.h \
+    src/dsp/common_sse2.h \
    src/dsp/dsp.h \
    src/dsp/lossless.h \
+    src/dsp/mips_macro.h \
+    src/dsp/msa_macro.h \
    src/dsp/neon.h \
    src/dsp/yuv.h \
-    src/dsp/yuv_tables_sse2.h \
    src/enc/backward_references.h \
    src/enc/cost.h \
+    src/enc/delta_palettization.h \
    src/enc/histogram.h \
    src/enc/vp8enci.h \
    src/enc/vp8li.h \
    src/mux/muxi.h \
    src/utils/bit_reader.h \
+    src/utils/bit_reader_inl.h \
    src/utils/bit_writer.h \
    src/utils/color_cache.h \
+    src/utils/endian_inl.h \
    src/utils/filters.h \
    src/utils/huffman.h \
    src/utils/huffman_encode.h \
@ -238,18 +302,21 @@ HDRS = \
    $(HDRS_INSTALLED) \

 OUT_LIBS = examples/libexample_util.a src/libwebpdecoder.a src/libwebp.a
+EXTRA_LIB = src/libwebpextras.a
 OUT_EXAMPLES = examples/cwebp examples/dwebp
-EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux
+EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux \
+                 examples/anim_diff

 OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
 ifeq ($(MAKECMDGOALS),clean)
  OUTPUT += $(EXTRA_EXAMPLES)
-  OUTPUT += src/demux/libwebpdemux.a src/mux/libwebpmux.a
-  OUTPUT += examples/libgif2webp_util.a
+  OUTPUT += src/demux/libwebpdemux.a src/mux/libwebpmux.a $(EXTRA_LIB)
+  OUTPUT += examples/libgifdec.a examples/libanim_util.a
 endif

 ex: $(OUT_EXAMPLES)
 all: ex $(EXTRA_EXAMPLES)
+extras: $(EXTRA_LIB)

 $(EX_FORMAT_DEC_OBJS): %.o: %.h

@ -264,27 +331,37 @@ src/utils/bit_writer.o: src/utils/endian_inl.h
 %.o: %.c $(HDRS)
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@

+examples/libanim_util.a: $(ANIM_UTIL_OBJS)
+examples/libexample_dec.a: $(EX_FORMAT_DEC_OBJS)
 examples/libexample_util.a: $(EX_UTIL_OBJS)
-examples/libgif2webp_util.a: $(GIF2WEBP_UTIL_OBJS)
+examples/libgifdec.a: $(GIFDEC_OBJS)
 src/libwebpdecoder.a: $(LIBWEBPDECODER_OBJS)
 src/libwebp.a: $(LIBWEBP_OBJS)
 src/mux/libwebpmux.a: $(LIBWEBPMUX_OBJS)
 src/demux/libwebpdemux.a: $(LIBWEBPDEMUX_OBJS)
+src/libwebpextras.a: $(LIBWEBPEXTRA_OBJS)

 %.a:
 	$(AR) $(ARFLAGS) $@ $^

-examples/cwebp: examples/cwebp.o $(EX_FORMAT_DEC_OBJS)
+examples/anim_diff: examples/anim_diff.o $(ANIM_UTIL_OBJS) $(GIFDEC_OBJS)
+examples/cwebp: examples/cwebp.o
 examples/dwebp: examples/dwebp.o
-examples/gif2webp: examples/gif2webp.o
+examples/gif2webp: examples/gif2webp.o $(GIFDEC_OBJS)
 examples/vwebp: examples/vwebp.o
 examples/webpmux: examples/webpmux.o

-examples/cwebp: examples/libexample_util.a src/libwebp.a
+examples/anim_diff: examples/libanim_util.a examples/libgifdec.a
+examples/anim_diff: src/demux/libwebpdemux.a examples/libexample_util.a
+examples/anim_diff: src/libwebp.a
+examples/anim_diff: EXTRA_LIBS += $(GIF_LIBS)
+examples/anim_diff: EXTRA_FLAGS += -DWEBP_HAVE_GIF
+examples/cwebp: examples/libexample_util.a examples/libexample_dec.a
+examples/cwebp: src/libwebp.a
 examples/cwebp: EXTRA_LIBS += $(CWEBP_LIBS)
 examples/dwebp: examples/libexample_util.a src/libwebpdecoder.a
 examples/dwebp: EXTRA_LIBS += $(DWEBP_LIBS)
-examples/gif2webp: examples/libexample_util.a examples/libgif2webp_util.a
+examples/gif2webp: examples/libexample_util.a examples/libgifdec.a
 examples/gif2webp: src/mux/libwebpmux.a src/libwebp.a
 examples/gif2webp: EXTRA_LIBS += $(GIF_LIBS)
 examples/gif2webp: EXTRA_FLAGS += -DWEBP_HAVE_GIF
@ -324,23 +401,10 @@ clean:
              src/demux/*.o src/demux/*~ \
              src/dsp/*.o src/dsp/*~ \
              src/enc/*.o src/enc/*~ \
+              src/extras/*.o src/extras/*~ \
              src/mux/*.o src/mux/*~ \
              src/utils/*.o src/utils/*~ \
              src/webp/*~ man/*~ doc/*~ swig/*~ \

-superclean: clean
-	$(RM) -r .git *.log *.cache *~
-	$(RM) -r .deps */.deps */*/.deps
-	$(RM) -r .libs */.libs */*/.libs
-	$(RM) */*.lo */*/*.lo
-	$(RM) */*.la */*/*.la
-	$(RM) Makefile */Makefile */*/Makefile
-	$(RM) Makefile.in */Makefile.in */*/Makefile.in
-	$(RM) config.log autom4te.cache libtool config.h stamp-h1
-	$(RM) aclocal.m4 compile
-	$(RM) config.guess config.h.in config.sub config.status
-	$(RM) configure depcomp install-sh ltmain.sh missing src/libwebp.pc
-	$(RM) m4/*
-
-.PHONY: all clean dist ex superclean
+.PHONY: all clean dist ex
 .SUFFIXES:
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "Oct 13, 2014"
+.TH CWEBP 1 "June 23, 2016"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@ -23,7 +23,7 @@ Using "\-" as output name will direct output to 'stdout'.
 .TP
 .BI \-\- " string
 Explicitly specify the input file. This option is useful if the input
-file starts with an '\-' for instance. This option must appear \fBlast\fP.
+file starts with a '\-' for instance. This option must appear \fBlast\fP.
 Any other options afterward will be ignored.
 .TP
 .B \-h, \-help
@ -35,6 +35,17 @@ A summary of all the possible options.
 .B \-version
 Print the version number (as major.minor.revision) and exit.
 .TP
+.B \-lossless
+Encode the image without any loss. For images with fully transparent areas,
+the invisible pixel values (R/G/B or Y/U/V) will be preserved only if the
+\-exact option is used.
+.TP
+.BI \-near_lossless " int
+Use near\-lossless image preprocessing. This option adjusts pixel values
+to help compressibility, but has minimal impact on the visual quality.
+It triggers lossless compression mode automatically.
+Range is 0 (maximum preprocessing) to 100 (no preprocessing, the default).
+.TP
 .BI \-q " float
 Specify the compression factor for RGB channels between 0 and 100. The default
 is 75.
@ -42,46 +53,31 @@ is 75.
 In case of lossy compression (default), a small factor produces a smaller file
 with lower quality. Best quality is achieved by using a value of 100.
 .br
-In case of lossless compression (specified by the \-lossless option), a small
-factor enables faster compression speed, but produces a larger file. Maximum
-compression is achieved by using a value of 100.
-.\" TODO(jzern): restore post-v0.4.1
-.\" .TP
-.\" .BI \-z " int
-.\" Switch on \fBlossless\fP compression mode with the specified level between 0
-.\" and 9, with level 0 being the fastest, 9 being the slowest. Fast mode
-.\" produces larger file size than slower ones. A good default is \-z 6.
-.\" This option is actually a shortcut for some predefined settings for quality
-.\" and method. If options \-q  or \-m are subsequently used, they will invalidate
-.\" the effect of this \-z option.
+In case of lossless compression (specified by the \fB\-lossless\fP option), a
+small factor enables faster compression speed, but produces a larger file.
+Maximum compression is achieved by using a value of 100.
+.TP
+.BI \-z " int
+Switch on \fBlossless\fP compression mode with the specified level between 0
+and 9, with level 0 being the fastest, 9 being the slowest. Faster modes
+produce larger files than slower ones. A good default is \fB\-z 6\fP.
+This option is actually a shortcut for some predefined settings for quality
+and method. If options \fB\-q\fP or \fB\-m\fP are subsequently used, they will
+invalidate the effect of this option.
 .TP
 .BI \-alpha_q " int
 Specify the compression factor for alpha compression between 0 and 100.
 Lossless compression of alpha is achieved using a value of 100, while the lower
 values result in a lossy compression. The default is 100.
 .TP
-.BI \-f " int
-Specify the strength of the deblocking filter, between 0 (no filtering)
-and 100 (maximum filtering). A value of 0 will turn off any filtering.
-Higher value will increase the strength of the filtering process applied
-after decoding the picture. The higher the value the smoother the picture will
-appear. Typical values are usually in the range of 20 to 50.
-.TP
 .BI \-preset " string
-Specify a set of pre-defined parameters to suit a particular type of
+Specify a set of pre\-defined parameters to suit a particular type of
 source material. Possible values are:  \fBdefault\fP, \fBphoto\fP,
 \fBpicture\fP, \fBdrawing\fP, \fBicon\fP, \fBtext\fP. Since
 \fB\-preset\fP overwrites the other parameters' values (except the
 \fB\-q\fP one), this option should preferably appear first in the
 order of the arguments.
 .TP
-.BI \-sns " int
-Specify the amplitude of the spatial noise shaping. Spatial noise shaping
-(or \fBsns\fP for short) refers to a general collection of built-in algorithms
-used to decide which area of the picture should use relatively less bits,
-and where else to better transfer these bits. The possible range goes from
-0 (algorithm is off) to 100 (the maximal effect). The default value is 80.
-.TP
 .BI \-m " int
 Specify the compression method to use. This parameter controls the
 trade off between encoding speed and the compressed file size and quality.
@ -91,14 +87,18 @@ additional encoding possibilities and decide on the quality gain.
 Lower value can result in faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
-.B \-jpeg_like
-Change the internal parameter mapping to better match the expected size
-of JPEG compression. This flag will generally produce an output file of
-similar size to its JPEG equivalent (for the same \fB\-q\fP setting), but
-with less visual distortion.
+.BI \-resize " width height
+Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
+If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
+the value will be calculated preserving the aspect\-ratio.
+.TP
+.BI \-crop " x_position y_position width height
+Crop the source to a rectangle with top\-left corner at coordinates
+(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
+This cropping area must be fully contained within the source rectangle.
 .TP
 .B \-mt
-Use multi-threading for encoding, if possible. This option is only effective
+Use multi\-threading for encoding, if possible. This option is only effective
 when using lossy compression on a source with a transparency channel.
 .TP
 .B \-low_memory
@ -109,13 +109,50 @@ different in size and distortion. This flag is only effective for methods
 some side effects on the bitstream: it forces certain bitstream features
 like number of partitions (forced to 1). Note that a more detailed report
 of bitstream size is printed by \fBcwebp\fP when using this option.
+
+.SS LOSSY OPTIONS
+These options are only effective when doing lossy encoding (the default, with
+or without alpha).
+
+.TP
+.BI \-size " int
+Specify a target size (in bytes) to try and reach for the compressed output.
+The compressor will make several passes of partial encoding in order to get
+as close as possible to this target. If both \fB\-size\fP and \fB\-psnr\fP
+are used, the \fB\-size\fP value will prevail.
+.TP
+.BI \-psnr " float
+Specify a target PSNR (in dB) to try and reach for the compressed output.
+The compressor will make several passes of partial encoding in order to get
+as close as possible to this target. If both \fB\-size\fP and \fB\-psnr\fP
+are used, the \fB\-size\fP value will prevail.
+.TP
+.BI \-pass " int
+Set a maximum number of passes to use during the dichotomy used by
+options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10, default is 1.
+If options \fB\-size\fP or \fB\-psnr\fP were used, but \fB\-pass\fP wasn't
+specified, a default value of '6' passes will be used.
 .TP
 .B \-af
-Turns auto-filter on. This algorithm will spend additional time optimizing
-the filtering strength to reach a well-balanced quality.
+Turns auto\-filter on. This algorithm will spend additional time optimizing
+the filtering strength to reach a well\-balanced quality.
+.TP
+.B \-jpeg_like
+Change the internal parameter mapping to better match the expected size
+of JPEG compression. This flag will generally produce an output file of
+similar size to its JPEG equivalent (for the same \fB\-q\fP setting), but
+with less visual distortion.

-.SH ADDITIONAL OPTIONS
-More advanced options are:
+.TP
+Advanced options:
+
+.TP
+.BI \-f " int
+Specify the strength of the deblocking filter, between 0 (no filtering)
+and 100 (maximum filtering). A value of 0 will turn off any filtering.
+Higher value will increase the strength of the filtering process applied
+after decoding the picture. The higher the value the smoother the picture will
+appear. Typical values are usually in the range of 20 to 50.
 .TP
 .BI \-sharpness " int
 Specify the sharpness of the filtering (if used).
@ -129,6 +166,13 @@ Use strong filtering (if filtering is being used thanks to the
 Disable strong filtering (if filtering is being used thanks to the
 \fB\-f\fP option) and use simple filtering instead.
 .TP
+.BI \-sns " int
+Specify the amplitude of the spatial noise shaping. Spatial noise shaping
+(or \fBsns\fP for short) refers to a general collection of built\-in algorithms
+used to decide which area of the picture should use relatively fewer bits,
+and where else to better transfer these bits. The possible range goes from
+0 (algorithm is off) to 100 (the maximal effect). The default value is 80.
+.TP
 .BI \-segments " int
 Change the number of partitions to use during the segmentation of the
 sns algorithm. Segments should be in range 1 to 4. Default value is 4.
@ -138,107 +182,30 @@ is used.
 .BI \-partition_limit " int
 Degrade quality by limiting the number of bits used by some macroblocks.
 Range is 0 (no degradation, the default) to 100 (full degradation).
-Useful values are usually around 30-70 for moderately large images.
-In the VP8 format, the so-called control partition has a limit of 512k and
+Useful values are usually around 30\-70 for moderately large images.
+In the VP8 format, the so\-called control partition has a limit of 512k and
 is used to store the following information: whether the macroblock is skipped,
 which segment it belongs to, whether it is coded as intra 4x4 or intra 16x16
-mode, and finally the prediction modes to use for each of the sub-blocks.
+mode, and finally the prediction modes to use for each of the sub\-blocks.
 For a very large image, 512k only leaves room to few bits per 16x16 macroblock.
 The absolute minimum is 4 bits per macroblock. Skip, segment, and mode
 information can use up almost all these 4 bits (although the case is unlikely),
 which is problematic for very large images. The partition_limit factor controls
-how frequently the most bit-costly mode (intra 4x4) will be used. This is
+how frequently the most bit\-costly mode (intra 4x4) will be used. This is
 useful in case the 512k limit is reached and the following message is displayed:
 \fIError code: 6 (PARTITION0_OVERFLOW: Partition #0 is too big to fit 512k)\fP.
-If using \fB-partition_limit\fP is not enough to meet the 512k constraint, one
+If using \fB\-partition_limit\fP is not enough to meet the 512k constraint, one
 should use less segments in order to save more header bits per macroblock.
-See the \fB-segments\fP option.
-.TP
-.BI \-size " int
-Specify a target size (in bytes) to try and reach for the compressed output.
-Compressor will make several pass of partial encoding in order to get as
-close as possible to this target.
-.TP
-.BI \-psnr " float
-Specify a target PSNR (in dB) to try and reach for the compressed output.
-Compressor will make several pass of partial encoding in order to get as
-close as possible to this target.
-.TP
-.BI \-pass " int
-Set a maximum number of passes to use during the dichotomy used by
-options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
-.TP
-.BI \-resize " width height
-Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
-If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
-the value will be calculated preserving the aspect-ratio.
-.TP
-.BI \-crop " x_position y_position width height
-Crop the source to a rectangle with top-left corner at coordinates
-(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
-This cropping area must be fully contained within the source rectangle.
-.TP
-.BI \-s " width height
-Specify that the input file actually consists of raw Y'CbCr samples following
-the ITU-R BT.601 recommendation, in 4:2:0 linear format.
-The luma plane has size \fBwidth\fP x \fBheight\fP.
-.TP
-.BI \-map " int
-Output additional ASCII-map of encoding information. Possible map values
-range from 1 to 6. This is only meant to help debugging.
-.TP
-.BI \-pre " int
-Specify some pre-processing steps. Using a value of '2' will trigger
-quality-dependent pseudo-random dithering during RGBA->YUVA conversion
-(lossy compression only).
-.TP
-.BI \-alpha_filter " string
-Specify the predictive filtering method for the alpha plane. One of 'none',
-\&'fast' or 'best', in increasing complexity and slowness order. Default is
-\&'fast'. Internally, alpha filtering is performed using four possible
-predictions (none, horizontal, vertical, gradient). The 'best' mode will try
-each mode in turn and pick the one which gives the smaller size. The 'fast'
-mode will just try to form an a-priori guess without testing all modes.
-.TP
-.BI \-alpha_method " int
-Specify the algorithm used for alpha compression: 0 or 1. Algorithm 0 denotes
-no compression, 1 uses WebP lossless format for compression. The default is 1.
-.TP
-.B \-alpha_cleanup
-Modify unseen RGB values under fully transparent area, to help compressibility.
-The default is off.
-.TP
-.BI \-blend_alpha " int
-This option blends the alpha channel (if present) with the source using the
-background color specified in hexadecimal as 0xrrggbb. The alpha channel is
-afterward reset to the opaque value 255.
-.TP
-.B \-noalpha
-Using this option will discard the alpha channel.
-.TP
-.B \-lossless
-Encode the image without any loss.
-.TP
-.BI \-hint " string
-Specify the hint about input image type. Possible values are:
-\fBphoto\fP, \fBpicture\fP or \fBgraph\fP.
-.TP
-.BI \-metadata " string
-A comma separated list of metadata to copy from the input to the output if
-present.
-Valid values: \fBall\fP, \fBnone\fP, \fBexif\fP, \fBicc\fP, \fBxmp\fP.
-The default is \fBnone\fP.
+See the \fB\-segments\fP option.

-Note: each input format may not support all combinations.
-.TP
-.B \-noasm
-Disable all assembly optimizations.
+.SS LOGGING OPTIONS
+These options control the level of output:
 .TP
 .B \-v
 Print extra information (encoding time in particular).
 .TP
 .B \-print_psnr
-Compute and report average PSNR (Peak-Signal-To-Noise ratio).
+Compute and report average PSNR (Peak\-Signal\-To\-Noise ratio).
 .TP
 .B \-print_ssim
 Compute and report average SSIM (structural similarity
@ -256,13 +223,69 @@ Do not print anything.
 .TP
 .B \-short
 Only print brief information (output file size and PSNR) for testing purpose.
+.TP
+.BI \-map " int
+Output additional ASCII\-map of encoding information. Possible map values
+range from 1 to 6. This is only meant to help debugging.
+
+.SS ADDITIONAL OPTIONS
+More advanced options are:
+.TP
+.BI \-s " width height
+Specify that the input file actually consists of raw Y'CbCr samples following
+the ITU\-R BT.601 recommendation, in 4:2:0 linear format.
+The luma plane has size \fBwidth\fP x \fBheight\fP.
+.TP
+.BI \-pre " int
+Specify some preprocessing steps. Using a value of '2' will trigger
+quality\-dependent pseudo\-random dithering during RGBA\->YUVA conversion
+(lossy compression only).
+.TP
+.BI \-alpha_filter " string
+Specify the predictive filtering method for the alpha plane. One of 'none',
+\&'fast' or 'best', in increasing complexity and slowness order. Default is
+\&'fast'. Internally, alpha filtering is performed using four possible
+predictions (none, horizontal, vertical, gradient). The 'best' mode will try
+each mode in turn and pick the one which gives the smaller size. The 'fast'
+mode will just try to form an a priori guess without testing all modes.
+.TP
+.BI \-alpha_method " int
+Specify the algorithm used for alpha compression: 0 or 1. Algorithm 0 denotes
+no compression, 1 uses WebP lossless format for compression. The default is 1.
+.TP
+.B \-exact
+Preserve RGB values in transparent area. The default is off, to help
+compressibility.
+.TP
+.BI \-blend_alpha " int
+This option blends the alpha channel (if present) with the source using the
+background color specified in hexadecimal as 0xrrggbb. The alpha channel is
+afterward reset to the opaque value 255.
+.TP
+.B \-noalpha
+Using this option will discard the alpha channel.
+.TP
+.BI \-hint " string
+Specify the hint about input image type. Possible values are:
+\fBphoto\fP, \fBpicture\fP or \fBgraph\fP.
+.TP
+.BI \-metadata " string
+A comma separated list of metadata to copy from the input to the output if
+present.
+Valid values: \fBall\fP, \fBnone\fP, \fBexif\fP, \fBicc\fP, \fBxmp\fP.
+The default is \fBnone\fP.
+
+Note: each input format may not support all combinations.
+.TP
+.B \-noasm
+Disable all assembly optimizations.

 .SH BUGS
-Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+Please report all bugs to the issue tracker:
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
-http://www.webmproject.org/code/contribute/submitting-patches/
+http://www.webmproject.org/code/contribute/submitting\-patches/

 .SH EXAMPLES
 cwebp \-q 50 -lossless picture.png \-o picture_lossless.webp
@ -274,9 +297,10 @@ cwebp \-sns 70 \-f 50 \-size 60000 picture.png \-o picture.webp
 cwebp \-o picture.webp \-\- \-\-\-picture.png

 .SH AUTHORS
-\fBcwebp\fP was written by the WebP team.
+\fBcwebp\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "July 22, 2014"
+.TH DWEBP 1 "June 23, 2016"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@ -67,12 +67,11 @@ but it will make the decoding faster.
 Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
 post-processing effect applied to chroma components in lossy compression.
 It helps by smoothing gradients and avoiding banding artifacts.
-.\" TODO(jzern): restore post-v0.4.1
-.\" .TP
-.\" .BI \-alpha_dither
-.\" If the compressed file contains a transparency plane that was quantized
-.\" during compression, this flag will allow dithering the reconstructed plane
-.\" in order to generate smoother transparency gradients.
+.TP
+.BI \-alpha_dither
+If the compressed file contains a transparency plane that was quantized
+during compression, this flag will allow dithering the reconstructed plane
+in order to generate smoother transparency gradients.
 .TP
 .B \-nodither
 Disable all dithering (default).
@ -87,16 +86,20 @@ This cropping area must be fully contained within the source rectangle.
 The top-left corner will be snapped to even coordinates if needed.
 This option is meant to reduce the memory needed for cropping large images.
 Note: the cropping is applied \fIbefore\fP any scaling.
-.\" TODO(jzern): restore post-v0.4.1
-.\" .TP
-.\" .B \-flip
-.\" Flip decoded image vertically (can be useful for OpenGL textures for instance).
 .TP
-.BI \-scale " width height
+.B \-flip
+Flip decoded image vertically (can be useful for OpenGL textures for instance).
+.TP
+\fB\-resize\fR, \fB\-scale\fI width height\fR
 Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This
 option is mostly intended to reducing the memory needed to decode large images,
-when only a small version is needed (thumbnail, preview, etc.).  Note: scaling
+when only a small version is needed (thumbnail, preview, etc.). Note: scaling
 is applied \fIafter\fP cropping.
+If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
+the value will be calculated preserving the aspect-ratio.
+.TP
+.B \-quiet
+Do not print anything.
 .TP
 .B \-v
 Print extra information (decoding time in particular).
@ -105,8 +108,8 @@ Print extra information (decoding time in particular).
 Disable all assembly optimizations.

 .SH BUGS
-Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+Please report all bugs to the issue tracker:
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
@ -121,9 +124,10 @@ dwebp \-o output.ppm \-\- \-\-\-picture.webp
 cat picture.webp | dwebp \-o \- \-\- \- > output.ppm

 .SH AUTHORS
-\fBdwebp\fP was written by the WebP team.
+\fBdwebp\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "March 7, 2014"
+.TH GIF2WEBP 1 "June 23, 2016"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@ -54,6 +54,12 @@ additional encoding possibilities and decide on the quality gain.
 Lower value can result is faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
+.BI \-min_size
+Encode image to achieve smallest size. This disables key frame insertion and
+picks the dispose method resulting in smallest output for each frame. It uses
+lossless compression by default, but can be combined with \-q, \-m, \-lossy or
+\-mixed options.
+.TP
 .BI \-kmin " int
 .TP
 .BI \-kmax " int
@ -62,7 +68,8 @@ Specify the minimum and maximum distance between consecutive key frames
 some key frames into the output animation as needed so that this criteria is
 satisfied.
 .br
-A 'kmin' value of 0 will turn off insertion of key frames.
+A 'kmin' value of 0 will turn off insertion of key frames. A 'kmax' value of 0
+will result in all frames being key frames.
 Typical values are in the range 3 to 30. Default values are kmin = 9,
 kmax = 17 for lossless compression and kmin = 3, kmax = 5 for lossy compression.
 .br
@ -110,8 +117,8 @@ Print extra information.
 Do not print anything.

 .SH BUGS
-Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+Please report all bugs to the issue tracker:
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
@ -128,9 +135,10 @@ gif2webp \-lossy \-f 50 picture.gif \-o picture.webp
 gif2webp \-q 70 \-o picture.webp \-\- \-\-\-picture.gif

 .SH AUTHORS
-\fBgif2webp\fP was written by the WebP team.
+\fBgif2webp\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written by Urvang Joshi <urvang@google.com>, for the
 Debian project (and may be used by others).
--- a/man/vwebp.1
+++ b/man/vwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH VWEBP 1 "July 23, 2014"
+.TH VWEBP 1 "June 23, 2016"
 .SH NAME
 vwebp \- decompress a WebP file and display it in a window
 .SH SYNOPSIS
@ -33,11 +33,10 @@ Disable in-loop filtering.
 Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
 post-processing effect applied to chroma components in lossy compression.
 It helps by smoothing gradients and avoiding banding artifacts. Default: 50.
-.\" TODO(jzern): restore post-v0.4.1
-.\" .TP
-.\" .BI \-noalphadither
-.\" By default, quantized transparency planes are dithered during decompression,
-.\" to smooth the gradients. This flag will prevent this dithering.
+.TP
+.BI \-noalphadither
+By default, quantized transparency planes are dithered during decompression,
+to smooth the gradients. This flag will prevent this dithering.
 .TP
 .B \-mt
 Use multi-threading for decoding, if possible.
@ -64,8 +63,8 @@ Overlay file information.
 Quit.

 .SH BUGS
-Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+Please report all bugs to the issue tracker:
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
@ -78,9 +77,10 @@ vwebp picture.webp -mt -dither 0
 vwebp \-\- \-\-\-picture.webp

 .SH AUTHORS
-\fBvwebp\fP was written by the WebP team.
+\fBvwebp\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written for the Debian project (and may be used by others).

--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPMUX 1 "August 28, 2014"
+.TH WEBPMUX 1 "June 23, 2016"
 .SH NAME
 webpmux \- create animated WebP files from non\-animated WebP images, extract
 frames from animated WebP images, and manage XMP/EXIF metadata and ICC profile.
@ -128,8 +128,8 @@ Output file in WebP format.
 The nature of EXIF, XMP and ICC data is not checked and is assumed to be valid.

 .SH BUGS
-Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+Please report all bugs to the issue tracker:
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting\-patches/
@ -195,9 +195,10 @@ webpmux \-get icc \-o image_profile.icc \-\- \-\-\-icc_container.webp
 webpmux \-strip icc \-o without_icc.webp \-\- \-\-\-icc_container.webp

 .SH AUTHORS
-\fBwebpmux\fP is written by the WebP team.
+\fBwebpmux\fP is a part of libwebp and was written by the WebP team.
 .br
-The latest source tree is available at http://www.webmproject.org/code
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written by Vikas Arora <vikaas.arora@gmail.com>,
 for the Debian project (and may be used by others).
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -1,5 +1,5 @@
-# The mux and demux libraries depend on libwebp, thus the '.' to force the
-# build order so it's available to them.
+# The mux, demux and extras libraries depend on libwebp, thus the '.' to force
+# the build order so it's available to them.
 SUBDIRS = dec enc dsp utils .
 if WANT_MUX
  SUBDIRS += mux
@ -7,6 +7,9 @@ endif
 if WANT_DEMUX
  SUBDIRS += demux
 endif
+if WANT_EXTRAS
+  SUBDIRS += extras
+endif

 lib_LTLIBRARIES = libwebp.la

@ -35,7 +38,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 5:3:0
+libwebp_la_LDFLAGS = -no-undefined -version-info 6:1:0
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

@ -47,7 +50,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la

-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:3:0
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 2:1:0
  pkgconfig_DATA += libwebpdecoder.pc
 endif

--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@ -4,6 +4,7 @@ libwebpdecode_la_SOURCES =
 libwebpdecode_la_SOURCES += alpha.c
 libwebpdecode_la_SOURCES += alphai.h
 libwebpdecode_la_SOURCES += buffer.c
+libwebpdecode_la_SOURCES += common.h
 libwebpdecode_la_SOURCES += decode_vp8.h
 libwebpdecode_la_SOURCES += frame.c
 libwebpdecode_la_SOURCES += idec.c
@ -23,5 +24,5 @@ libwebpdecodeinclude_HEADERS += ../webp/types.h
 noinst_HEADERS =
 noinst_HEADERS += ../webp/format_constants.h

-libwebpdecode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
+libwebpdecode_la_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 libwebpdecodeincludedir = $(includedir)/webp
--- a/src/dec/alpha.c
+++ b/src/dec/alpha.c
@ -15,6 +15,7 @@
 #include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
+#include "../dsp/dsp.h"
 #include "../utils/quant_levels_dec.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
@ -22,12 +23,14 @@
 //------------------------------------------------------------------------------
 // ALPHDecoder object.

-ALPHDecoder* ALPHNew(void) {
+// Allocates a new alpha decoder instance.
+static ALPHDecoder* ALPHNew(void) {
  ALPHDecoder* const dec = (ALPHDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
  return dec;
 }

-void ALPHDelete(ALPHDecoder* const dec) {
+// Clears and deallocates an alpha decoder instance.
+static void ALPHDelete(ALPHDecoder* const dec) {
  if (dec != NULL) {
    VP8LDelete(dec->vp8l_dec_);
    dec->vp8l_dec_ = NULL;
@ -43,17 +46,21 @@ void ALPHDelete(ALPHDecoder* const dec) {
 // Returns false in case of error in alpha header (data too short, invalid
 // compression method or filter, error in lossless header data etc).
 static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
-                    size_t data_size, int width, int height, uint8_t* output) {
+                    size_t data_size, const VP8Io* const src_io,
+                    uint8_t* output) {
  int ok = 0;
  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
  int rsrv;
+  VP8Io* const io = &dec->io_;

-  assert(width > 0 && height > 0);
-  assert(data != NULL && output != NULL);
+  assert(data != NULL && output != NULL && src_io != NULL);

-  dec->width_ = width;
-  dec->height_ = height;
+  VP8FiltersInit();
+  dec->output_ = output;
+  dec->width_ = src_io->width;
+  dec->height_ = src_io->height;
+  assert(dec->width_ > 0 && dec->height_ > 0);

  if (data_size <= ALPHA_HEADER_LEN) {
    return 0;
@ -71,13 +78,28 @@ static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
    return 0;
  }

+  // Copy the necessary parameters from src_io to io
+  VP8InitIo(io);
+  WebPInitCustomIo(NULL, io);
+  io->opaque = dec;
+  io->width = src_io->width;
+  io->height = src_io->height;
+
+  io->use_cropping = src_io->use_cropping;
+  io->crop_left = src_io->crop_left;
+  io->crop_right = src_io->crop_right;
+  io->crop_top = src_io->crop_top;
+  io->crop_bottom = src_io->crop_bottom;
+  // No need to copy the scaling parameters.
+
  if (dec->method_ == ALPHA_NO_COMPRESSION) {
    const size_t alpha_decoded_size = dec->width_ * dec->height_;
    ok = (alpha_data_size >= alpha_decoded_size);
  } else {
    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
-    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
+    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size);
  }
+
  return ok;
 }

@ -88,15 +110,30 @@ static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
 static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
  ALPHDecoder* const alph_dec = dec->alph_dec_;
  const int width = alph_dec->width_;
-  const int height = alph_dec->height_;
-  WebPUnfilterFunc unfilter_func = WebPUnfilters[alph_dec->filter_];
-  uint8_t* const output = dec->alpha_plane_;
+  const int height = alph_dec->io_.crop_bottom;
  if (alph_dec->method_ == ALPHA_NO_COMPRESSION) {
-    const size_t offset = row * width;
-    const size_t num_pixels = num_rows * width;
-    assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN + offset + num_pixels);
-    memcpy(dec->alpha_plane_ + offset,
-           dec->alpha_data_ + ALPHA_HEADER_LEN + offset, num_pixels);
+    int y;
+    const uint8_t* prev_line = dec->alpha_prev_line_;
+    const uint8_t* deltas = dec->alpha_data_ + ALPHA_HEADER_LEN + row * width;
+    uint8_t* dst = dec->alpha_plane_ + row * width;
+    assert(deltas <= &dec->alpha_data_[dec->alpha_data_size_]);
+    if (alph_dec->filter_ != WEBP_FILTER_NONE) {
+      assert(WebPUnfilters[alph_dec->filter_] != NULL);
+      for (y = 0; y < num_rows; ++y) {
+        WebPUnfilters[alph_dec->filter_](prev_line, deltas, dst, width);
+        prev_line = dst;
+        dst += width;
+        deltas += width;
+      }
+    } else {
+      for (y = 0; y < num_rows; ++y) {
+        memcpy(dst, deltas, width * sizeof(*dst));
+        prev_line = dst;
+        dst += width;
+        deltas += width;
+      }
+    }
+    dec->alpha_prev_line_ = prev_line;
  } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
    assert(alph_dec->vp8l_dec_ != NULL);
    if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) {
@ -104,62 +141,92 @@ static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
    }
  }

-  if (unfilter_func != NULL) {
-    unfilter_func(width, height, width, row, num_rows, output);
-  }
-
-  if (row + num_rows == dec->pic_hdr_.height_) {
+  if (row + num_rows >= height) {
    dec->is_alpha_decoded_ = 1;
  }
  return 1;
 }

+static int AllocateAlphaPlane(VP8Decoder* const dec, const VP8Io* const io) {
+  const int stride = io->width;
+  const int height = io->crop_bottom;
+  const uint64_t alpha_size = (uint64_t)stride * height;
+  assert(dec->alpha_plane_mem_ == NULL);
+  dec->alpha_plane_mem_ =
+      (uint8_t*)WebPSafeMalloc(alpha_size, sizeof(*dec->alpha_plane_));
+  if (dec->alpha_plane_mem_ == NULL) {
+    return 0;
+  }
+  dec->alpha_plane_ = dec->alpha_plane_mem_;
+  dec->alpha_prev_line_ = NULL;
+  return 1;
+}
+
+void WebPDeallocateAlphaMemory(VP8Decoder* const dec) {
+  assert(dec != NULL);
+  WebPSafeFree(dec->alpha_plane_mem_);
+  dec->alpha_plane_mem_ = NULL;
+  dec->alpha_plane_ = NULL;
+  ALPHDelete(dec->alph_dec_);
+  dec->alph_dec_ = NULL;
+}
+
 //------------------------------------------------------------------------------
 // Main entry point.

 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      const VP8Io* const io,
                                      int row, int num_rows) {
-  const int width = dec->pic_hdr_.width_;
-  const int height = dec->pic_hdr_.height_;
+  const int width = io->width;
+  const int height = io->crop_bottom;
+
+  assert(dec != NULL && io != NULL);

  if (row < 0 || num_rows <= 0 || row + num_rows > height) {
    return NULL;    // sanity check.
  }

-  if (row == 0) {
-    // Initialize decoding.
-    assert(dec->alpha_plane_ != NULL);
-    dec->alph_dec_ = ALPHNew();
-    if (dec->alph_dec_ == NULL) return NULL;
-    if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
-                  width, height, dec->alpha_plane_)) {
-      ALPHDelete(dec->alph_dec_);
-      dec->alph_dec_ = NULL;
-      return NULL;
-    }
-    // if we allowed use of alpha dithering, check whether it's needed at all
-    if (dec->alph_dec_->pre_processing_ != ALPHA_PREPROCESSED_LEVELS) {
-      dec->alpha_dithering_ = 0;  // disable dithering
-    } else {
-      num_rows = height;          // decode everything in one pass
-    }
-  }
-
  if (!dec->is_alpha_decoded_) {
-    int ok = 0;
-    assert(dec->alph_dec_ != NULL);
-    ok = ALPHDecode(dec, row, num_rows);
-    if (ok && dec->alpha_dithering_ > 0) {
-      ok = WebPDequantizeLevels(dec->alpha_plane_, width, height,
-                                dec->alpha_dithering_);
+    if (dec->alph_dec_ == NULL) {    // Initialize decoder.
+      dec->alph_dec_ = ALPHNew();
+      if (dec->alph_dec_ == NULL) return NULL;
+      if (!AllocateAlphaPlane(dec, io)) goto Error;
+      if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
+                    io, dec->alpha_plane_)) {
+        goto Error;
+      }
+      // if we allowed use of alpha dithering, check whether it's needed at all
+      if (dec->alph_dec_->pre_processing_ != ALPHA_PREPROCESSED_LEVELS) {
+        dec->alpha_dithering_ = 0;   // disable dithering
+      } else {
+        num_rows = height - row;     // decode everything in one pass
+      }
    }
-    if (!ok || dec->is_alpha_decoded_) {
+
+    assert(dec->alph_dec_ != NULL);
+    assert(row + num_rows <= height);
+    if (!ALPHDecode(dec, row, num_rows)) goto Error;
+
+    if (dec->is_alpha_decoded_) {   // finished?
      ALPHDelete(dec->alph_dec_);
      dec->alph_dec_ = NULL;
+      if (dec->alpha_dithering_ > 0) {
+        uint8_t* const alpha = dec->alpha_plane_ + io->crop_top * width
+                             + io->crop_left;
+        if (!WebPDequantizeLevels(alpha,
+                                  io->crop_right - io->crop_left,
+                                  io->crop_bottom - io->crop_top,
+                                  width, dec->alpha_dithering_)) {
+          goto Error;
+        }
+      }
    }
-    if (!ok) return NULL;  // Error.
  }

  // Return a pointer to the current decoded row.
  return dec->alpha_plane_ + row * width;
+
+ Error:
+  WebPDeallocateAlphaMemory(dec);
+  return NULL;
 }
--- a/src/dec/alphai.h
+++ b/src/dec/alphai.h
@ -32,19 +32,18 @@ struct ALPHDecoder {
  int pre_processing_;
  struct VP8LDecoder* vp8l_dec_;
  VP8Io io_;
-  int use_8b_decode;  // Although alpha channel requires only 1 byte per
-                      // pixel, sometimes VP8LDecoder may need to allocate
-                      // 4 bytes per pixel internally during decode.
+  int use_8b_decode_;  // Although alpha channel requires only 1 byte per
+                       // pixel, sometimes VP8LDecoder may need to allocate
+                       // 4 bytes per pixel internally during decode.
+  uint8_t* output_;
+  const uint8_t* prev_line_;   // last output row (or NULL)
 };

 //------------------------------------------------------------------------------
 // internal functions. Not public.

-// Allocates a new alpha decoder instance.
-ALPHDecoder* ALPHNew(void);
-
-// Clears and deallocates an alpha decoder instance.
-void ALPHDelete(ALPHDecoder* const dec);
+// Deallocate memory associated with dec->alpha_plane_ decoding
+void WebPDeallocateAlphaMemory(VP8Decoder* const dec);

 //------------------------------------------------------------------------------

--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@ -33,6 +33,11 @@ static int IsValidColorspace(int webp_csp_mode) {
  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
 }

+// strictly speaking, the very last (or first, if flipped) row
+// doesn't require padding.
+#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE)       \
+    (uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH)
+
 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  int ok = 1;
  const WEBP_CSP_MODE mode = buffer->colorspace;
@ -42,20 +47,22 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
    ok = 0;
  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int uv_width  = (width  + 1) / 2;
+    const int uv_height = (height + 1) / 2;
    const int y_stride = abs(buf->y_stride);
    const int u_stride = abs(buf->u_stride);
    const int v_stride = abs(buf->v_stride);
    const int a_stride = abs(buf->a_stride);
-    const uint64_t y_size = (uint64_t)y_stride * height;
-    const uint64_t u_size = (uint64_t)u_stride * ((height + 1) / 2);
-    const uint64_t v_size = (uint64_t)v_stride * ((height + 1) / 2);
-    const uint64_t a_size = (uint64_t)a_stride * height;
+    const uint64_t y_size = MIN_BUFFER_SIZE(width, height, y_stride);
+    const uint64_t u_size = MIN_BUFFER_SIZE(uv_width, uv_height, u_stride);
+    const uint64_t v_size = MIN_BUFFER_SIZE(uv_width, uv_height, v_stride);
+    const uint64_t a_size = MIN_BUFFER_SIZE(width, height, a_stride);
    ok &= (y_size <= buf->y_size);
    ok &= (u_size <= buf->u_size);
    ok &= (v_size <= buf->v_size);
    ok &= (y_stride >= width);
-    ok &= (u_stride >= (width + 1) / 2);
-    ok &= (v_stride >= (width + 1) / 2);
+    ok &= (u_stride >= uv_width);
+    ok &= (v_stride >= uv_width);
    ok &= (buf->y != NULL);
    ok &= (buf->u != NULL);
    ok &= (buf->v != NULL);
@ -67,13 +74,14 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  } else {    // RGB checks
    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
    const int stride = abs(buf->stride);
-    const uint64_t size = (uint64_t)stride * height;
+    const uint64_t size = MIN_BUFFER_SIZE(width, height, stride);
    ok &= (size <= buf->size);
    ok &= (stride >= width * kModeBpp[mode]);
    ok &= (buf->rgba != NULL);
  }
  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
+#undef MIN_BUFFER_SIZE

 static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
  const int w = buffer->width;
@ -84,7 +92,7 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
    return VP8_STATUS_INVALID_PARAM;
  }

-  if (!buffer->is_external_memory && buffer->private_memory == NULL) {
+  if (buffer->is_external_memory <= 0 && buffer->private_memory == NULL) {
    uint8_t* output;
    int uv_stride = 0, a_stride = 0;
    uint64_t uv_size = 0, a_size = 0, total_size;
@ -181,11 +189,14 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
      h = ch;
    }
    if (options->use_scaling) {
-      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+      int scaled_width = options->scaled_width;
+      int scaled_height = options->scaled_height;
+      if (!WebPRescalerGetScaledDimensions(
+              w, h, &scaled_width, &scaled_height)) {
        return VP8_STATUS_INVALID_PARAM;
      }
-      w = options->scaled_width;
-      h = options->scaled_height;
+      w = scaled_width;
+      h = scaled_height;
    }
  }
  out->width = w;
@ -195,12 +206,10 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
  status = AllocateBuffer(out);
  if (status != VP8_STATUS_OK) return status;

-#if WEBP_DECODER_ABI_VERSION > 0x0203
  // Use the stride trick if vertical flip is needed.
  if (options != NULL && options->flip) {
    status = WebPFlipBuffer(out);
  }
-#endif
  return status;
 }

@ -218,7 +227,7 @@ int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {

 void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
  if (buffer != NULL) {
-    if (!buffer->is_external_memory) {
+    if (buffer->is_external_memory <= 0) {
      WebPSafeFree(buffer->private_memory);
    }
    buffer->private_memory = NULL;
@ -247,5 +256,45 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
  }
 }

-//------------------------------------------------------------------------------
+// Copies the pixels of 'src_buf' into 'dst_buf'.
+// Both pointers must be non-NULL and both buffers must use the same
+// colorspace; these preconditions are only checked with assert().
+// 'dst_buf' receives the dimensions of 'src_buf' and is then validated with
+// CheckDecBuffer(). Returns VP8_STATUS_INVALID_PARAM if that validation fails
+// (nothing is copied in that case) and VP8_STATUS_OK otherwise.
+VP8StatusCode WebPCopyDecBufferPixels(const WebPDecBuffer* const src_buf,
+                                      WebPDecBuffer* const dst_buf) {
+  assert(src_buf != NULL && dst_buf != NULL);
+  assert(src_buf->colorspace == dst_buf->colorspace);

+  // Adopt the source dimensions first: the check below is then performed
+  // against the picture size that is about to be copied.
+  dst_buf->width = src_buf->width;
+  dst_buf->height = src_buf->height;
+  if (CheckDecBuffer(dst_buf) != VP8_STATUS_OK) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  if (WebPIsRGBMode(src_buf->colorspace)) {
+    // Single packed plane; the row width handed to WebPCopyPlane() is
+    // width * kModeBpp[colorspace].
+    const WebPRGBABuffer* const src = &src_buf->u.RGBA;
+    const WebPRGBABuffer* const dst = &dst_buf->u.RGBA;
+    WebPCopyPlane(src->rgba, src->stride, dst->rgba, dst->stride,
+                  src_buf->width * kModeBpp[src_buf->colorspace],
+                  src_buf->height);
+  } else {
+    // Planar case: Y at full size, U and V at half size (rounded up), and
+    // the alpha plane only when the colorspace carries alpha.
+    const WebPYUVABuffer* const src = &src_buf->u.YUVA;
+    const WebPYUVABuffer* const dst = &dst_buf->u.YUVA;
+    WebPCopyPlane(src->y, src->y_stride, dst->y, dst->y_stride,
+                  src_buf->width, src_buf->height);
+    WebPCopyPlane(src->u, src->u_stride, dst->u, dst->u_stride,
+                  (src_buf->width + 1) / 2, (src_buf->height + 1) / 2);
+    WebPCopyPlane(src->v, src->v_stride, dst->v, dst->v_stride,
+                  (src_buf->width + 1) / 2, (src_buf->height + 1) / 2);
+    if (WebPIsAlphaMode(src_buf->colorspace)) {
+      WebPCopyPlane(src->a, src->a_stride, dst->a, dst->a_stride,
+                    src_buf->width, src_buf->height);
+    }
+  }
+  return VP8_STATUS_OK;
+}
+
+// Returns non-zero only when all three conditions hold:
+//  - 'output' is flagged with is_external_memory >= 2 (NOTE(review):
+//    presumably the marker for "slow" external memory -- confirm against the
+//    public header),
+//  - the output colorspace is a premultiplied mode,
+//  - 'features' is non-NULL and reports that the bitstream has alpha.
+// 'output' must not be NULL (asserted); 'features' may be NULL, in which case
+// the result is 0.
+int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
+                        const WebPBitstreamFeatures* const features) {
+  assert(output != NULL);
+  return (output->is_external_memory >= 2) &&
+         WebPIsPremultipliedMode(output->colorspace) &&
+         (features != NULL && features->has_alpha);
+}
+
+//------------------------------------------------------------------------------
--- a/src/dec/common.h
+++ b/src/dec/common.h
@ -0,0 +1,54 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Definitions and macros common to encoding and decoding
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DEC_COMMON_H_
+#define WEBP_DEC_COMMON_H_
+
+// intra prediction modes
+// Several enumerators below share a numeric value: the Luma16/UV names alias
+// the corresponding 4x4 names, and the 'special' DC variants reuse the values
+// 4..6. They are only distinguishable by the context in which they are used.
+enum { B_DC_PRED = 0,   // 4x4 modes
+       B_TM_PRED = 1,
+       B_VE_PRED = 2,
+       B_HE_PRED = 3,
+       B_RD_PRED = 4,
+       B_VR_PRED = 5,
+       B_LD_PRED = 6,
+       B_VL_PRED = 7,
+       B_HD_PRED = 8,
+       B_HU_PRED = 9,
+       NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
+
+       // Luma16 or UV modes
+       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
+       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
+       B_PRED = NUM_BMODES,   // refined I4x4 mode
+       NUM_PRED_MODES = 4,    // presumably counts DC_PRED, TM_PRED, V_PRED, H_PRED
+
+       // special modes
+       // (DC prediction variants; CheckMode() in frame.c substitutes them for
+       // B_DC_PRED on macroblocks located in the first row and/or column)
+       B_DC_PRED_NOTOP = 4,
+       B_DC_PRED_NOLEFT = 5,
+       B_DC_PRED_NOTOPLEFT = 6,
+       NUM_B_DC_MODES = 7 };
+
+// Fixed counts used as table/array dimensions.
+// NOTE(review): the values presumably mirror limits of the VP8 bitstream
+// format -- confirm against the format specification before changing any.
+enum { MB_FEATURE_TREE_PROBS = 3,
+       NUM_MB_SEGMENTS = 4,
+       NUM_REF_LF_DELTAS = 4,
+       NUM_MODE_LF_DELTAS = 4,    // I4x4, ZERO, *, SPLIT
+       MAX_NUM_PARTITIONS = 8,
+       // Probabilities
+       NUM_TYPES = 4,   // 0: i16-AC,  1: i16-DC,  2:chroma-AC,  3:i4-AC
+       NUM_BANDS = 8,
+       NUM_CTX = 3,
+       NUM_PROBAS = 11
+     };
+
+#endif    // WEBP_DEC_COMMON_H_
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@ -15,10 +15,180 @@
 #include "./vp8i.h"
 #include "../utils/utils.h"

-#define ALIGN_MASK (32 - 1)
+//------------------------------------------------------------------------------
+// Main reconstruction function.
+
+// Offset, inside a work buffer whose row stride is BPS, of the top-left sample
+// of each of the sixteen 4x4 luma sub-blocks of a macroblock. Entries are in
+// raster order: x = 0, 4, 8, 12 on each line, then y = 0, 4, 8, 12.
+static const int kScan[16] = {
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
+};
+
+// Adjusts a prediction mode to the position of the macroblock.
+// Only B_DC_PRED is affected; it is replaced by:
+//   B_DC_PRED_NOTOPLEFT  for macroblock (0,0),
+//   B_DC_PRED_NOLEFT     for the rest of the first column (mb_x == 0),
+//   B_DC_PRED_NOTOP      for the rest of the first row (mb_y == 0).
+// Any other 'mode', and B_DC_PRED elsewhere, is returned unchanged.
+static int CheckMode(int mb_x, int mb_y, int mode) {
+  if (mode == B_DC_PRED) {
+    if (mb_x == 0) {
+      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+    } else {
+      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+    }
+  }
+  return mode;
+}
+
+// Copies exactly four bytes from 'src' to 'dst'. memcpy() is used, so neither
+// pointer needs any particular alignment.
+static void Copy32b(uint8_t* const dst, const uint8_t* const src) {
+  memcpy(dst, src, 4);
+}
+
+// Dispatches on the two most significant bits of 'bits' to pick the transform
+// applied from the coefficients 'src' onto the samples at 'dst':
+//   3 -> VP8Transform(src, dst, 0)
+//   2 -> VP8TransformAC3(src, dst)
+//   1 -> VP8TransformDC(src, dst)
+//   0 -> nothing to do
+// Callers shift 'bits' left by two between consecutive 4x4 blocks.
+static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
+                                    uint8_t* const dst) {
+  switch (bits >> 30) {
+    case 3:
+      VP8Transform(src, dst, 0);
+      break;
+    case 2:
+      VP8TransformAC3(src, dst);
+      break;
+    case 1:
+      VP8TransformDC(src, dst);
+      break;
+    default:
+      break;
+  }
+}
+
+// Chroma counterpart of DoTransform(). Only the low 8 bits of 'bits' are
+// examined: when all are zero nothing is done; when any bit of mask 0xaa is
+// set VP8TransformUV() is called, otherwise VP8TransformDCUV().
+static void DoUVTransform(uint32_t bits, const int16_t* const src,
+                          uint8_t* const dst) {
+  if (bits & 0xff) {    // any non-zero coeff at all?
+    if (bits & 0xaa) {  // any non-zero AC coefficient?
+      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
+    } else {
+      VP8TransformDCUV(src, dst);
+    }
+  }
+}

+// Reconstructs one row of macroblocks.
+// For each of the dec->mb_w_ blocks described by ctx->mb_data_, the samples
+// are predicted and the residuals added inside the dec->yuv_b_ work area;
+// the resulting 16x16 luma and 8x8 chroma samples are then copied into
+// dec->cache_y_ / cache_u_ / cache_v_, in the group of rows selected by
+// ctx->id_. The row being processed is ctx->mb_y_.
 static void ReconstructRow(const VP8Decoder* const dec,
-                           const VP8ThreadContext* ctx);  // TODO(skal): remove
+                           const VP8ThreadContext* ctx) {
+  int j;
+  int mb_x;
+  const int mb_y = ctx->mb_y_;
+  const int cache_id = ctx->id_;
+  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
+  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
+  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
+
+  // Initialize left-most block.
+  // (the column just left of each plane is set to the constant 129)
+  for (j = 0; j < 16; ++j) {
+    y_dst[j * BPS - 1] = 129;
+  }
+  for (j = 0; j < 8; ++j) {
+    u_dst[j * BPS - 1] = 129;
+    v_dst[j * BPS - 1] = 129;
+  }
+
+  // Init top-left sample on left column too.
+  if (mb_y > 0) {
+    y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
+  } else {
+    // we only need to do this init once at block (0,0).
+    // Afterward, it remains valid for the whole topmost row.
+    // (16 + 4 + 1: the top-left sample, 16 top samples and 4 top-right ones)
+    memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
+    memset(u_dst - BPS - 1, 127, 8 + 1);
+    memset(v_dst - BPS - 1, 127, 8 + 1);
+  }
+
+  // Reconstruct one row.
+  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+    const VP8MBData* const block = ctx->mb_data_ + mb_x;
+
+    // Rotate in the left samples from previously decoded block. We move four
+    // pixels at a time for alignment reason, and because of in-loop filter.
+    // (j starts at -1 so that the row above the block is rotated as well)
+    if (mb_x > 0) {
+      for (j = -1; j < 16; ++j) {
+        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
+      }
+      for (j = -1; j < 8; ++j) {
+        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
+        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
+      }
+    }
+    {
+      // bring top samples into the cache
+      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
+      const int16_t* const coeffs = block->coeffs_;
+      uint32_t bits = block->non_zero_y_;
+      int n;
+
+      if (mb_y > 0) {
+        memcpy(y_dst - BPS, top_yuv[0].y, 16);
+        memcpy(u_dst - BPS, top_yuv[0].u, 8);
+        memcpy(v_dst - BPS, top_yuv[0].v, 8);
+      }
+
+      // predict and add residuals
+      if (block->is_i4x4_) {   // 4x4
+        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+
+        if (mb_y > 0) {
+          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
+            // no block to the right: replicate the last top sample four times
+            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
+          } else {
+            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
+          }
+        }
+        // replicate the top-right pixels below
+        // (top_right is a uint32_t*, so BPS here counts 32-bit words, i.e.
+        // each assignment lands 4 * BPS bytes further down)
+        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+
+        // predict and add residuals for all 4x4 blocks in turn.
+        for (n = 0; n < 16; ++n, bits <<= 2) {
+          uint8_t* const dst = y_dst + kScan[n];
+          VP8PredLuma4[block->imodes_[n]](dst);
+          DoTransform(bits, coeffs + n * 16, dst);
+        }
+      } else {    // 16x16
+        const int pred_func = CheckMode(mb_x, mb_y, block->imodes_[0]);
+        VP8PredLuma16[pred_func](y_dst);
+        if (bits != 0) {
+          for (n = 0; n < 16; ++n, bits <<= 2) {
+            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
+          }
+        }
+      }
+      {
+        // Chroma
+        // (U coefficients start at block 16, V coefficients at block 20;
+        // the V flags sit 8 bits above the U flags in non_zero_uv_)
+        const uint32_t bits_uv = block->non_zero_uv_;
+        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
+        VP8PredChroma8[pred_func](u_dst);
+        VP8PredChroma8[pred_func](v_dst);
+        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
+        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
+      }
+
+      // stash away top samples for next block
+      // (skipped on the last macroblock row, where nothing lies below)
+      if (mb_y < dec->mb_h_ - 1) {
+        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
+        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
+        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
+      }
+    }
+    // Transfer reconstructed samples from yuv_b_ cache to final destination.
+    {
+      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
+      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
+      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
+      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
+      for (j = 0; j < 16; ++j) {
+        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+      }
+      for (j = 0; j < 8; ++j) {
+        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
+        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+      }
+    }
+  }
+}

 //------------------------------------------------------------------------------
 // Filtering
@ -112,7 +282,6 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
        VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
        int level = base_level;
        if (hdr->use_lf_delta_) {
-          // TODO(skal): only CURRENT is handled for now.
          level += hdr->ref_lf_delta_[0];
          if (i4x4) {
            level += hdr->mode_lf_delta_[0];
@ -147,6 +316,9 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
 //------------------------------------------------------------------------------
 // Dithering

+// minimal amp that will provide a non-zero dithering effect
+#define MIN_DITHER_AMP 4
+
 #define DITHER_AMP_TAB_SIZE 12
 static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
  // roughly, it's dqm->uv_mat_[1]
@ -177,7 +349,6 @@ void VP8InitDithering(const WebPDecoderOptions* const options,
        dec->dither_ = 1;
      }
    }
-#if WEBP_DECODER_ABI_VERSION > 0x0204
    // potentially allow alpha dithering
    dec->alpha_dithering_ = options->alpha_dithering_strength;
    if (dec->alpha_dithering_ > 100) {
@ -185,31 +356,17 @@ void VP8InitDithering(const WebPDecoderOptions* const options,
    } else if (dec->alpha_dithering_ < 0) {
      dec->alpha_dithering_ = 0;
    }
-#endif
  }
 }

-// minimal amp that will provide a non-zero dithering effect
-#define MIN_DITHER_AMP 4
-#define DITHER_DESCALE 4
-#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
-#define DITHER_AMP_BITS 8
-#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
-
+// Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
+// Draws 64 values from the generator 'rg' (one per sample of an 8x8 block,
+// scaled by 'amp') and hands them to VP8DitherCombine8x8() together with the
+// block at 'dst', whose row stride is 'bps'.
+// NOTE(review): the values are stored as uint8_t, which assumes that
+// VP8RandomBits2() called with VP8_DITHER_AMP_BITS + 1 bits returns at most
+// 255 -- confirm against the definition of VP8_DITHER_AMP_BITS.
 static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
-  int i, j;
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) {
-      // TODO: could be made faster with SSE2
-      const int bits =
-          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
-      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
-      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
-      const int v = (int)dst[i] + delta;
-      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;
-    }
-    dst += bps;
+  uint8_t dither[64];
+  int i;
+  for (i = 0; i < 8 * 8; ++i) {
+    dither[i] = VP8RandomBits2(rg, VP8_DITHER_AMP_BITS + 1, amp);
  }
+  VP8DitherCombine8x8(dither, dst, bps);
 }

 static void DitherRow(VP8Decoder* const dec) {
@ -295,7 +452,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
    if (dec->alpha_data_ != NULL && y_start < y_end) {
      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
      // good idea.
-      io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
+      io->a = VP8DecompressAlphaRows(dec, io, y_start, y_end - y_start);
      if (io->a == NULL) {
        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                           "Could not decode alpha data.");
@ -554,7 +711,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
  const uint64_t needed = (uint64_t)intra_pred_mode_size
                        + top_size + mb_info_size + f_info_size
                        + yuv_size + mb_data_size
-                        + cache_size + alpha_size + ALIGN_MASK;
+                        + cache_size + alpha_size + WEBP_ALIGN_CST;
  uint8_t* mem;

  if (needed != (size_t)needed) return 0;  // check for overflow
@ -591,8 +748,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
    dec->thread_ctx_.f_info_ += mb_w;
  }

-  mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
-  assert((yuv_size & ALIGN_MASK) == 0);
+  mem = (uint8_t*)WEBP_ALIGN(mem);
+  assert((yuv_size & WEBP_ALIGN_CST) == 0);
  dec->yuv_b_ = (uint8_t*)mem;
  mem += yuv_size;

@ -644,7 +801,7 @@ static void InitIo(VP8Decoder* const dec, VP8Io* io) {
  io->a = NULL;
 }

-int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io) {
  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches_.
  if (!AllocateMemory(dec)) return 0;
  InitIo(dec, io);
@ -653,176 +810,3 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
 }

 //------------------------------------------------------------------------------
-// Main reconstruction function.
-
-static const int kScan[16] = {
-  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
-  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
-  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
-  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
-};
-
-static int CheckMode(int mb_x, int mb_y, int mode) {
-  if (mode == B_DC_PRED) {
-    if (mb_x == 0) {
-      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
-    } else {
-      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
-    }
-  }
-  return mode;
-}
-
-static void Copy32b(uint8_t* dst, uint8_t* src) {
-  memcpy(dst, src, 4);
-}
-
-static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
-                                    uint8_t* const dst) {
-  switch (bits >> 30) {
-    case 3:
-      VP8Transform(src, dst, 0);
-      break;
-    case 2:
-      VP8TransformAC3(src, dst);
-      break;
-    case 1:
-      VP8TransformDC(src, dst);
-      break;
-    default:
-      break;
-  }
-}
-
-static void DoUVTransform(uint32_t bits, const int16_t* const src,
-                          uint8_t* const dst) {
-  if (bits & 0xff) {    // any non-zero coeff at all?
-    if (bits & 0xaa) {  // any non-zero AC coefficient?
-      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
-    } else {
-      VP8TransformDCUV(src, dst);
-    }
-  }
-}
-
-static void ReconstructRow(const VP8Decoder* const dec,
-                           const VP8ThreadContext* ctx) {
-  int j;
-  int mb_x;
-  const int mb_y = ctx->mb_y_;
-  const int cache_id = ctx->id_;
-  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
-  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
-  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
-  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
-    const VP8MBData* const block = ctx->mb_data_ + mb_x;
-
-    // Rotate in the left samples from previously decoded block. We move four
-    // pixels at a time for alignment reason, and because of in-loop filter.
-    if (mb_x > 0) {
-      for (j = -1; j < 16; ++j) {
-        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
-      }
-      for (j = -1; j < 8; ++j) {
-        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
-        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
-      }
-    } else {
-      for (j = 0; j < 16; ++j) {
-        y_dst[j * BPS - 1] = 129;
-      }
-      for (j = 0; j < 8; ++j) {
-        u_dst[j * BPS - 1] = 129;
-        v_dst[j * BPS - 1] = 129;
-      }
-      // Init top-left sample on left column too
-      if (mb_y > 0) {
-        y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
-      }
-    }
-    {
-      // bring top samples into the cache
-      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
-      const int16_t* const coeffs = block->coeffs_;
-      uint32_t bits = block->non_zero_y_;
-      int n;
-
-      if (mb_y > 0) {
-        memcpy(y_dst - BPS, top_yuv[0].y, 16);
-        memcpy(u_dst - BPS, top_yuv[0].u, 8);
-        memcpy(v_dst - BPS, top_yuv[0].v, 8);
-      } else if (mb_x == 0) {
-        // we only need to do this init once at block (0,0).
-        // Afterward, it remains valid for the whole topmost row.
-        memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
-        memset(u_dst - BPS - 1, 127, 8 + 1);
-        memset(v_dst - BPS - 1, 127, 8 + 1);
-      }
-
-      // predict and add residuals
-      if (block->is_i4x4_) {   // 4x4
-        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
-
-        if (mb_y > 0) {
-          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
-            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
-          } else {
-            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
-          }
-        }
-        // replicate the top-right pixels below
-        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
-
-        // predict and add residuals for all 4x4 blocks in turn.
-        for (n = 0; n < 16; ++n, bits <<= 2) {
-          uint8_t* const dst = y_dst + kScan[n];
-          VP8PredLuma4[block->imodes_[n]](dst);
-          DoTransform(bits, coeffs + n * 16, dst);
-        }
-      } else {    // 16x16
-        const int pred_func = CheckMode(mb_x, mb_y,
-                                        block->imodes_[0]);
-        VP8PredLuma16[pred_func](y_dst);
-        if (bits != 0) {
-          for (n = 0; n < 16; ++n, bits <<= 2) {
-            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
-          }
-        }
-      }
-      {
-        // Chroma
-        const uint32_t bits_uv = block->non_zero_uv_;
-        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
-        VP8PredChroma8[pred_func](u_dst);
-        VP8PredChroma8[pred_func](v_dst);
-        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
-        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
-      }
-
-      // stash away top samples for next block
-      if (mb_y < dec->mb_h_ - 1) {
-        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
-        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
-        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
-      }
-    }
-    // Transfer reconstructed samples from yuv_b_ cache to final destination.
-    {
-      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
-      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
-      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
-      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
-      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
-      for (j = 0; j < 16; ++j) {
-        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
-      }
-      for (j = 0; j < 8; ++j) {
-        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
-        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
-      }
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@ -70,7 +70,9 @@ struct WebPIDecoder {
  VP8Io io_;

  MemBuffer mem_;          // input memory buffer.
-  WebPDecBuffer output_;   // output buffer (when no external one is supplied)
+  WebPDecBuffer output_;   // output buffer (when no external one is supplied,
+                           // or if the external one has slow-memory)
+  WebPDecBuffer* final_output_;  // Slow-memory output to copy to eventually.
  size_t chunk_size_;      // Compressed VP8/VP8L size extracted from Header.

  int last_mb_y_;          // last row reached for intra-mode decoding
@ -118,9 +120,9 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
  if (idec->dec_ != NULL) {
    if (!idec->is_lossless_) {
      VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
-      const int last_part = dec->num_parts_ - 1;
+      const uint32_t last_part = dec->num_parts_minus_one_;
      if (offset != 0) {
-        int p;
+        uint32_t p;
        for (p = 0; p <= last_part; ++p) {
          VP8RemapBitReader(dec->parts_ + p, offset);
        }
@ -130,8 +132,11 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
          VP8RemapBitReader(&dec->br_, offset);
        }
      }
-      assert(last_part >= 0);
-      dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
+      {
+        const uint8_t* const last_start = dec->parts_[last_part].buf_;
+        VP8BitReaderSetBuffer(&dec->parts_[last_part], last_start,
+                              mem->buf_ + mem->end_ - last_start);
+      }
      if (NeedCompressedAlpha(idec)) {
        ALPHDecoder* const alph_dec = dec->alph_dec_;
        dec->alpha_data_ += offset;
@ -240,16 +245,20 @@ static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {

 // To be called last.
+// Marks the decoder as done, applies the optional vertical flip and, when a
+// slow-memory destination is pending (idec->final_output_ != NULL), copies the
+// decoded pixels into it and makes it the visible output.
+// Returns the first failing status (flip or slow-copy), VP8_STATUS_OK
+// otherwise. The internal buffer is released whether or not the copy worked.
 static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
-#if WEBP_DECODER_ABI_VERSION > 0x0203
  const WebPDecoderOptions* const options = idec->params_.options;
  WebPDecBuffer* const output = idec->params_.output;

  idec->state_ = STATE_DONE;
  if (options != NULL && options->flip) {
-    return WebPFlipBuffer(output);
+    const VP8StatusCode status = WebPFlipBuffer(output);
+    if (status != VP8_STATUS_OK) return status;
+  }
+  if (idec->final_output_ != NULL) {
+    // Do the slow-copy. It can fail when the caller-supplied buffer does not
+    // pass validation, so the status must not be dropped: otherwise an
+    // untouched buffer would be reported as successfully decoded.
+    const VP8StatusCode status =
+        WebPCopyDecBufferPixels(output, idec->final_output_);
+    WebPFreeDecBuffer(&idec->output_);
+    if (status != VP8_STATUS_OK) return status;
+    *output = *idec->final_output_;
+    idec->final_output_ = NULL;
  }
-#endif
-  idec->state_ = STATE_DONE;
  return VP8_STATUS_OK;
 }

@ -377,8 +386,7 @@ static VP8StatusCode CopyParts0Data(WebPIDecoder* const idec) {
    }
    memcpy(part0_buf, br->buf_, part_size);
    mem->part0_buf_ = part0_buf;
-    br->buf_ = part0_buf;
-    br->buf_end_ = part0_buf + part_size;
+    VP8BitReaderSetBuffer(br, part0_buf, part_size);
  } else {
    // Else: just keep pointers to the partition #0's data in dec_->br_.
  }
@ -456,19 +464,20 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
    }
    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
      VP8BitReader* const token_br =
-          &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
+          &dec->parts_[dec->mb_y_ & dec->num_parts_minus_one_];
      MBContext context;
      SaveContext(dec, token_br, &context);
      if (!VP8DecodeMB(dec, token_br)) {
        // We shouldn't fail when MAX_MB data was available
-        if (dec->num_parts_ == 1 && MemDataSize(&idec->mem_) > MAX_MB_SIZE) {
+        if (dec->num_parts_minus_one_ == 0 &&
+            MemDataSize(&idec->mem_) > MAX_MB_SIZE) {
          return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
        }
        RestoreContext(&context, dec, token_br);
        return VP8_STATUS_SUSPENDED;
      }
      // Release buffer only if there is only one partition
-      if (dec->num_parts_ == 1) {
+      if (dec->num_parts_minus_one_ == 0) {
        idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_;
        assert(idec->mem_.start_ <= idec->mem_.end_);
      }
@ -506,9 +515,15 @@ static VP8StatusCode DecodeVP8LHeader(WebPIDecoder* const idec) {

  // Wait until there's enough data for decoding header.
  if (curr_size < (idec->chunk_size_ >> 3)) {
-    return VP8_STATUS_SUSPENDED;
+    dec->status_ = VP8_STATUS_SUSPENDED;
+    return ErrorStatusLossless(idec, dec->status_);
  }
+
  if (!VP8LDecodeHeader(dec, io)) {
+    if (dec->status_ == VP8_STATUS_BITSTREAM_ERROR &&
+        curr_size < idec->chunk_size_) {
+      dec->status_ = VP8_STATUS_SUSPENDED;
+    }
    return ErrorStatusLossless(idec, dec->status_);
  }
  // Allocate/verify output buffer now.
@ -527,23 +542,15 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
  const size_t curr_size = MemDataSize(&idec->mem_);
  assert(idec->is_lossless_);

-  // At present Lossless decoder can't decode image incrementally. So wait till
-  // all the image data is aggregated before image can be decoded.
-  if (curr_size < idec->chunk_size_) {
-    return VP8_STATUS_SUSPENDED;
-  }
+  // Switch to incremental decoding if we don't have all the bytes available.
+  dec->incremental_ = (curr_size < idec->chunk_size_);

  if (!VP8LDecodeImage(dec)) {
-    // The decoding is called after all the data-bytes are aggregated. Change
-    // the error to VP8_BITSTREAM_ERROR in case lossless decoder fails to decode
-    // all the pixels (VP8_STATUS_SUSPENDED).
-    if (dec->status_ == VP8_STATUS_SUSPENDED) {
-      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
-    }
    return ErrorStatusLossless(idec, dec->status_);
  }
-
-  return FinishDecoding(idec);
+  assert(dec->status_ == VP8_STATUS_OK || dec->status_ == VP8_STATUS_SUSPENDED);
+  return (dec->status_ == VP8_STATUS_SUSPENDED) ? dec->status_
+                                                : FinishDecoding(idec);
 }

  // Main decoding loop
@ -576,9 +583,10 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
 }

 //------------------------------------------------------------------------------
-// Public functions
+// Internal constructor

-WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
+static WebPIDecoder* NewDecoder(WebPDecBuffer* const output_buffer,
+                                const WebPBitstreamFeatures* const features) {
  WebPIDecoder* idec = (WebPIDecoder*)WebPSafeCalloc(1ULL, sizeof(*idec));
  if (idec == NULL) {
    return NULL;
@ -594,25 +602,46 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
  VP8InitIo(&idec->io_);

  WebPResetDecParams(&idec->params_);
-  idec->params_.output = (output_buffer != NULL) ? output_buffer
-                                                 : &idec->output_;
+  if (output_buffer == NULL || WebPAvoidSlowMemory(output_buffer, features)) {
+    idec->params_.output = &idec->output_;
+    idec->final_output_ = output_buffer;
+    if (output_buffer != NULL) {
+      idec->params_.output->colorspace = output_buffer->colorspace;
+    }
+  } else {
+    idec->params_.output = output_buffer;
+    idec->final_output_ = NULL;
+  }
  WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.

  return idec;
 }

+//------------------------------------------------------------------------------
+// Public functions
+
+// Public constructor: forwards to NewDecoder() without any bitstream features
+// (NULL). 'output_buffer' may be NULL. Returns whatever NewDecoder() returns,
+// i.e. NULL when the decoder could not be created.
+WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
+  return NewDecoder(output_buffer, NULL);
+}
+
 WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
                          WebPDecoderConfig* config) {
  WebPIDecoder* idec;
+  WebPBitstreamFeatures tmp_features;
+  WebPBitstreamFeatures* const features =
+      (config == NULL) ? &tmp_features : &config->input;
+  memset(&tmp_features, 0, sizeof(tmp_features));

  // Parse the bitstream's features, if requested:
-  if (data != NULL && data_size > 0 && config != NULL) {
-    if (WebPGetFeatures(data, data_size, &config->input) != VP8_STATUS_OK) {
+  if (data != NULL && data_size > 0) {
+    if (WebPGetFeatures(data, data_size, features) != VP8_STATUS_OK) {
      return NULL;
    }
  }
+
  // Create an instance of the incremental decoder
-  idec = WebPINewDecoder(config ? &config->output : NULL);
+  idec = (config != NULL) ? NewDecoder(&config->output, features)
+                          : NewDecoder(NULL, features);
  if (idec == NULL) {
    return NULL;
  }
@ -646,11 +675,11 @@ void WebPIDelete(WebPIDecoder* idec) {

 WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
                          size_t output_buffer_size, int output_stride) {
-  const int is_external_memory = (output_buffer != NULL);
+  const int is_external_memory = (output_buffer != NULL) ? 1 : 0;
  WebPIDecoder* idec;

  if (mode >= MODE_YUV) return NULL;
-  if (!is_external_memory) {    // Overwrite parameters to sane values.
+  if (is_external_memory == 0) {    // Overwrite parameters to sane values.
    output_buffer_size = 0;
    output_stride = 0;
  } else {  // A buffer was passed. Validate the other params.
@ -672,11 +701,11 @@ WebPIDecoder* WebPINewYUVA(uint8_t* luma, size_t luma_size, int luma_stride,
                           uint8_t* u, size_t u_size, int u_stride,
                           uint8_t* v, size_t v_size, int v_stride,
                           uint8_t* a, size_t a_size, int a_stride) {
-  const int is_external_memory = (luma != NULL);
+  const int is_external_memory = (luma != NULL) ? 1 : 0;
  WebPIDecoder* idec;
  WEBP_CSP_MODE colorspace;

-  if (!is_external_memory) {    // Overwrite parameters to sane values.
+  if (is_external_memory == 0) {    // Overwrite parameters to sane values.
    luma_size = u_size = v_size = a_size = 0;
    luma_stride = u_stride = v_stride = a_stride = 0;
    u = v = a = NULL;
@ -784,6 +813,9 @@ static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) {
  if (idec->state_ <= STATE_VP8_PARTS0) {
    return NULL;
  }
+  if (idec->final_output_ != NULL) {
+    return NULL;   // not yet slow-copied
+  }
  return idec->params_.output;
 }

@ -793,8 +825,7 @@ const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
  if (left != NULL) *left = 0;
  if (top != NULL) *top = 0;
-  // TODO(skal): later include handling of rotations.
-  if (src) {
+  if (src != NULL) {
    if (width != NULL) *width = src->width;
    if (height != NULL) *height = idec->params_.last_y;
  } else {
@ -859,4 +890,3 @@ int WebPISetIOHooks(WebPIDecoder* const idec,

  return 1;
 }
-
--- a/src/dec/io.c
+++ b/src/dec/io.c
@ -55,32 +55,6 @@ static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
  return io->mb_h;
 }

-//------------------------------------------------------------------------------
-// YUV444 -> RGB conversion
-
-#if 0   // TODO(skal): this is for future rescaling.
-static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) {
-  WebPDecBuffer* output = p->output;
-  const WebPRGBABuffer* const buf = &output->u.RGBA;
-  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
-  const uint8_t* y_src = io->y;
-  const uint8_t* u_src = io->u;
-  const uint8_t* v_src = io->v;
-  const WebPYUV444Converter convert = WebPYUV444Converters[output->colorspace];
-  const int mb_w = io->mb_w;
-  const int last = io->mb_h;
-  int j;
-  for (j = 0; j < last; ++j) {
-    convert(y_src, u_src, v_src, dst, mb_w);
-    y_src += io->y_stride;
-    u_src += io->uv_stride;
-    v_src += io->uv_stride;
-    dst += buf->stride;
-  }
-  return io->mb_h;
-}
-#endif
-
 //------------------------------------------------------------------------------
 // Fancy upsampling

@ -145,14 +119,24 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {

 //------------------------------------------------------------------------------

-static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+static void FillAlphaPlane(uint8_t* dst, int w, int h, int stride) {
+  int j;
+  for (j = 0; j < h; ++j) {
+    memset(dst, 0xff, w * sizeof(*dst));
+    dst += stride;
+  }
+}
+
+static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
+                        int expected_num_lines_out) {
  const uint8_t* alpha = io->a;
  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
  int j;
-
+  (void)expected_num_lines_out;
+  assert(expected_num_lines_out == mb_h);
  if (alpha != NULL) {
    for (j = 0; j < mb_h; ++j) {
      memcpy(dst, alpha, mb_w * sizeof(*dst));
@ -161,10 +145,7 @@ static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
    }
  } else if (buf->a != NULL) {
    // the user requested alpha, but there is none, set it to opaque.
-    for (j = 0; j < mb_h; ++j) {
-      memset(dst, 0xff, mb_w * sizeof(*dst));
-      dst += buf->a_stride;
-    }
+    FillAlphaPlane(dst, mb_w, mb_h, buf->a_stride);
  }
  return 0;
 }
@ -195,7 +176,8 @@ static int GetAlphaSourceRow(const VP8Io* const io,
  return start_y;
 }

-static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
+                        int expected_num_lines_out) {
  const uint8_t* alpha = io->a;
  if (alpha != NULL) {
    const int mb_w = io->mb_w;
@ -206,21 +188,13 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
    int num_rows;
    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
-    uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
-    uint32_t alpha_mask = 0xff;
-    int i, j;
-
-    for (j = 0; j < num_rows; ++j) {
-      for (i = 0; i < mb_w; ++i) {
-        const uint32_t alpha_value = alpha[i];
-        dst[4 * i] = alpha_value;
-        alpha_mask &= alpha_value;
-      }
-      alpha += io->width;
-      dst += buf->stride;
-    }
-    // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
-    if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) {
+    uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
+    const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
+                                            num_rows, dst, buf->stride);
+    (void)expected_num_lines_out;
+    assert(expected_num_lines_out == num_rows);
+    // has_alpha is true if there's non-trivial alpha to premultiply with.
+    if (has_alpha && WebPIsPremultipliedMode(colorspace)) {
      WebPApplyAlphaMultiply(base_rgba, alpha_first,
                             mb_w, num_rows, buf->stride);
    }
@ -228,7 +202,8 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
  return 0;
 }

-static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
+static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
+                             int expected_num_lines_out) {
  const uint8_t* alpha = io->a;
  if (alpha != NULL) {
    const int mb_w = io->mb_w;
@ -244,7 +219,6 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
 #endif
    uint32_t alpha_mask = 0x0f;
    int i, j;
-
    for (j = 0; j < num_rows; ++j) {
      for (i = 0; i < mb_w; ++i) {
        // Fill in the alpha value (converted to 4 bits).
@ -255,6 +229,8 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
      alpha += io->width;
      alpha_dst += buf->stride;
    }
+    (void)expected_num_lines_out;
+    assert(expected_num_lines_out == num_rows);
    if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) {
      WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
    }
@ -296,16 +272,24 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
  return num_lines_out;
 }

-static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
+                                int expected_num_lines_out) {
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  if (io->a != NULL) {
-    const WebPYUVABuffer* const buf = &p->output->u.YUVA;
    uint8_t* dst_y = buf->y + p->last_y * buf->y_stride;
    const uint8_t* src_a = buf->a + p->last_y * buf->a_stride;
    const int num_lines_out = Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
+    (void)expected_num_lines_out;
+    assert(expected_num_lines_out == num_lines_out);
    if (num_lines_out > 0) {   // unmultiply the Y
      WebPMultRows(dst_y, buf->y_stride, src_a, buf->a_stride,
                   p->scaler_a.dst_width, num_lines_out, 1);
    }
+  } else if (buf->a != NULL) {
+    // the user requested alpha, but there is none, set it to opaque.
+    assert(p->last_y + expected_num_lines_out <= io->scaled_height);
+    FillAlphaPlane(buf->a + p->last_y * buf->a_stride,
+                   io->scaled_width, expected_num_lines_out, buf->a_stride);
  }
  return 0;
 }
@ -322,37 +306,31 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
  const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
  size_t tmp_size;
-  int32_t* work;
+  rescaler_t* work;

  tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
  if (has_alpha) {
    tmp_size += work_size * sizeof(*work);
  }
-  p->memory = WebPSafeCalloc(1ULL, tmp_size);
+  p->memory = WebPSafeMalloc(1ULL, tmp_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                   buf->y, out_width, out_height, buf->y_stride, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                   work);
  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                   work + work_size);
  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                   work + work_size + uv_work_size);
  p->emit = EmitRescaledYUV;

  if (has_alpha) {
    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                     buf->a, out_width, out_height, buf->a_stride, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                     work + work_size + 2 * uv_work_size);
    p->emit_alpha = EmitRescaledAlphaYUV;
    WebPInitAlphaProcessing();
@ -367,17 +345,17 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
  const WebPYUV444Converter convert =
      WebPYUV444Converters[p->output->colorspace];
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* dst = buf->rgba + y_pos * buf->stride;
  int num_lines_out = 0;
   // For RGB rescaling, because of the YUV420 subsampling, the current U/V
   // scan position can be +1/-1 line from the Y one. Hence the double test.
  while (WebPRescalerHasPendingOutput(&p->scaler_y) &&
         WebPRescalerHasPendingOutput(&p->scaler_u)) {
-    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    assert(y_pos + num_lines_out < p->output->height);
    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
-    WebPRescalerExportRow(&p->scaler_y, 0);
-    WebPRescalerExportRow(&p->scaler_u, 0);
-    WebPRescalerExportRow(&p->scaler_v, 0);
+    WebPRescalerExportRow(&p->scaler_y);
+    WebPRescalerExportRow(&p->scaler_u);
+    WebPRescalerExportRow(&p->scaler_v);
    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
            dst, p->scaler_y.dst_width);
    dst += buf->stride;
@ -395,55 +373,54 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
    const int y_lines_in =
        WebPRescalerImport(&p->scaler_y, mb_h - j,
                           io->y + j * io->y_stride, io->y_stride);
-    const int u_lines_in =
-        WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
-                           io->u + uv_j * io->uv_stride, io->uv_stride);
-    const int v_lines_in =
-        WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
-                           io->v + uv_j * io->uv_stride, io->uv_stride);
-    (void)v_lines_in;   // remove a gcc warning
-    assert(u_lines_in == v_lines_in);
    j += y_lines_in;
-    uv_j += u_lines_in;
-    num_lines_out += ExportRGB(p, num_lines_out);
+    if (WebPRescaleNeededLines(&p->scaler_u, uv_mb_h - uv_j)) {
+      const int u_lines_in =
+          WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
+                             io->u + uv_j * io->uv_stride, io->uv_stride);
+      const int v_lines_in =
+          WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
+                             io->v + uv_j * io->uv_stride, io->uv_stride);
+      (void)v_lines_in;   // remove a gcc warning
+      assert(u_lines_in == v_lines_in);
+      uv_j += u_lines_in;
+    }
+    num_lines_out += ExportRGB(p, p->last_y + num_lines_out);
  }
  return num_lines_out;
 }

-static int ExportAlpha(WebPDecParams* const p, int y_pos) {
+static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
  const WEBP_CSP_MODE colorspace = p->output->colorspace;
  const int alpha_first =
      (colorspace == MODE_ARGB || colorspace == MODE_Argb);
  uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
  int num_lines_out = 0;
  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
-  uint32_t alpha_mask = 0xff;
+  uint32_t non_opaque = 0;
  const int width = p->scaler_a.dst_width;

-  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
-    int i;
-    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
-    for (i = 0; i < width; ++i) {
-      const uint32_t alpha_value = p->scaler_a.dst[i];
-      dst[4 * i] = alpha_value;
-      alpha_mask &= alpha_value;
-    }
+  while (WebPRescalerHasPendingOutput(&p->scaler_a) &&
+         num_lines_out < max_lines_out) {
+    assert(y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a);
+    non_opaque |= WebPDispatchAlpha(p->scaler_a.dst, 0, width, 1, dst, 0);
    dst += buf->stride;
    ++num_lines_out;
  }
-  if (is_premult_alpha && alpha_mask != 0xff) {
+  if (is_premult_alpha && non_opaque) {
    WebPApplyAlphaMultiply(base_rgba, alpha_first,
                           width, num_lines_out, buf->stride);
  }
  return num_lines_out;
 }

-static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
+static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
+                               int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
 #ifdef WEBP_SWAP_16BIT_CSP
  uint8_t* alpha_dst = base_rgba;
 #else
@ -455,10 +432,11 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
  uint32_t alpha_mask = 0x0f;

-  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
+  while (WebPRescalerHasPendingOutput(&p->scaler_a) &&
+         num_lines_out < max_lines_out) {
    int i;
-    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
+    assert(y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a);
    for (i = 0; i < width; ++i) {
      // Fill in the alpha value (converted to 4 bits).
      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
@ -474,15 +452,17 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
  return num_lines_out;
 }

-static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
+                                int expected_num_out_lines) {
  if (io->a != NULL) {
    WebPRescaler* const scaler = &p->scaler_a;
-    int j = 0;
-    int pos = 0;
-    while (j < io->mb_h) {
-      j += WebPRescalerImport(scaler, io->mb_h - j,
-                              io->a + j * io->width, io->width);
-      pos += p->emit_alpha_row(p, pos);
+    int lines_left = expected_num_out_lines;
+    const int y_end = p->last_y + lines_left;
+    while (lines_left > 0) {
+      const int row_offset = scaler->src_y - io->mb_y;
+      WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y,
+                         io->a + row_offset * io->width, io->width);
+      lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left);
    }
  }
  return 0;
@ -495,7 +475,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const int uv_in_width  = (io->mb_w + 1) >> 1;
  const int uv_in_height = (io->mb_h + 1) >> 1;
  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
-  int32_t* work;  // rescalers work area
+  rescaler_t* work;  // rescalers work area
  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
  size_t tmp_size1, tmp_size2, total_size;

@ -506,30 +486,27 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
    tmp_size2 += out_width;
  }
  total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
-  p->memory = WebPSafeCalloc(1ULL, total_size);
+  p->memory = WebPSafeMalloc(1ULL, total_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
  tmp = (uint8_t*)(work + tmp_size1);
  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                   tmp + 0 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                   work + 0 * work_size);
  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                   tmp + 1 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 1 * work_size);
  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                   tmp + 2 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 2 * work_size);
  p->emit = EmitRescaledRGB;
+  WebPInitYUV444Converters();

  if (has_alpha) {
    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                     tmp + 3 * out_width, out_width, out_height, 0, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                     work + 3 * work_size);
    p->emit_alpha = EmitRescaledAlphaRGB;
    if (p->output->colorspace == MODE_RGBA_4444 ||
@ -569,6 +546,7 @@ static int CustomSetup(VP8Io* io) {
    }
  } else {
    if (is_rgb) {
+      WebPInitSamplers();
      p->emit = EmitSampledRGB;   // default
      if (io->fancy_upsampling) {
 #ifdef FANCY_UPSAMPLING
@ -583,8 +561,6 @@ static int CustomSetup(VP8Io* io) {
        p->emit = EmitFancyRGB;
        WebPInitUpsamplers();
 #endif
-      } else {
-        WebPInitSamplers();
      }
    } else {
      p->emit = EmitYUV;
@ -621,7 +597,7 @@ static int CustomPut(const VP8Io* io) {
  }
  num_lines_out = p->emit(io, p);
  if (p->emit_alpha != NULL) {
-    p->emit_alpha(io, p);
+    p->emit_alpha(io, p, num_lines_out);
  }
  p->last_y += num_lines_out;
  return 1;
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@ -494,6 +494,12 @@ static const uint8_t
 };

 // Paragraph 9.9
+
+static const int kBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  // extra entry as sentinel
+};
+
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  VP8Proba* const proba = &dec->proba_;
  int t, b, c, p;
@ -507,6 +513,9 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
        }
      }
    }
+    for (b = 0; b < 16 + 1; ++b) {
+      proba->bands_ptr_[t][b] = &proba->bands_[t][kBands[b]];
+    }
  }
  dec->use_skip_proba_ = VP8Get(br);
  if (dec->use_skip_proba_) {
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -50,7 +50,7 @@ VP8Decoder* VP8New(void) {
    SetOk(dec);
    WebPGetWorkerInterface()->Init(&dec->worker_);
    dec->ready_ = 0;
-    dec->num_parts_ = 1;
+    dec->num_parts_minus_one_ = 0;
  }
  return dec;
 }
@ -75,10 +75,7 @@ void VP8Delete(VP8Decoder* const dec) {

 int VP8SetError(VP8Decoder* const dec,
                VP8StatusCode error, const char* const msg) {
-  // TODO This check would be unnecessary if alpha decompression was separated
-  // from VP8ProcessRow/FinishRow. This avoids setting 'dec->status_' to
-  // something other than VP8_STATUS_BITSTREAM_ERROR on alpha decompression
-  // failure.
+  // The oldest error reported takes precedence over the new one.
  if (dec->status_ == VP8_STATUS_OK) {
    dec->status_ = error;
    dec->error_msg_ = msg;
@ -193,25 +190,27 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
  const uint8_t* sz = buf;
  const uint8_t* buf_end = buf + size;
  const uint8_t* part_start;
-  int last_part;
-  int p;
+  size_t size_left = size;
+  size_t last_part;
+  size_t p;

-  dec->num_parts_ = 1 << VP8GetValue(br, 2);
-  last_part = dec->num_parts_ - 1;
-  part_start = buf + last_part * 3;
-  if (buf_end < part_start) {
+  dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2)) - 1;
+  last_part = dec->num_parts_minus_one_;
+  if (size < 3 * last_part) {
    // we can't even read the sizes with sz[]! That's a failure.
    return VP8_STATUS_NOT_ENOUGH_DATA;
  }
+  part_start = buf + last_part * 3;
+  size_left -= last_part * 3;
  for (p = 0; p < last_part; ++p) {
-    const uint32_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16);
-    const uint8_t* part_end = part_start + psize;
-    if (part_end > buf_end) part_end = buf_end;
-    VP8InitBitReader(dec->parts_ + p, part_start, part_end);
-    part_start = part_end;
+    size_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16);
+    if (psize > size_left) psize = size_left;
+    VP8InitBitReader(dec->parts_ + p, part_start, psize);
+    part_start += psize;
+    size_left -= psize;
    sz += 3;
  }
-  VP8InitBitReader(dec->parts_ + last_part, part_start, buf_end);
+  VP8InitBitReader(dec->parts_ + last_part, part_start, size_left);
  return (part_start < buf_end) ? VP8_STATUS_OK :
           VP8_STATUS_SUSPENDED;   // Init is ok, but there's not enough data
 }
@ -304,15 +303,22 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {

    dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
    dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
+
    // Setup default output area (can be later modified during io->setup())
    io->width = pic_hdr->width_;
    io->height = pic_hdr->height_;
-    io->use_scaling  = 0;
+    // IMPORTANT! Use sane dimensions in the crop_* and scaled_* fields, so
+    // that they can be used interchangeably without always testing for
+    // 'use_cropping'.
    io->use_cropping = 0;
    io->crop_top  = 0;
    io->crop_left = 0;
    io->crop_right  = io->width;
    io->crop_bottom = io->height;
+    io->use_scaling  = 0;
+    io->scaled_width = io->width;
+    io->scaled_height = io->height;
+
    io->mb_w = io->width;   // sanity check
    io->mb_h = io->height;  // ditto

@ -328,7 +334,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
  }

  br = &dec->br_;
-  VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
+  VP8InitBitReader(br, buf, frm_hdr->partition_length_);
  buf += frm_hdr->partition_length_;
  buf_size -= frm_hdr->partition_length_;

@ -371,11 +377,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
 //------------------------------------------------------------------------------
 // Residual decoding (Paragraph 13.2 / 13.3)

-static const int kBands[16 + 1] = {
-  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
-  0  // extra entry as sentinel
-};
-
 static const uint8_t kCat3[] = { 173, 148, 140, 0 };
 static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
 static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
@ -419,20 +420,19 @@ static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
 }

 // Returns the position of the last non-zero coeff plus one
-static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob,
+static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[],
                     int ctx, const quant_t dq, int n, int16_t* out) {
-  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
-  const uint8_t* p = prob[n].probas_[ctx];
+  const uint8_t* p = prob[n]->probas_[ctx];
  for (; n < 16; ++n) {
    if (!VP8GetBit(br, p[0])) {
      return n;  // previous coeff was last non-zero coeff
    }
    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
-      p = prob[kBands[++n]].probas_[0];
+      p = prob[++n]->probas_[0];
      if (n == 16) return 16;
    }
    {        // non zero coeff
-      const VP8ProbaArray* const p_ctx = &prob[kBands[n + 1]].probas_[0];
+      const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
      int v;
      if (!VP8GetBit(br, p[2])) {
        v = 1;
@ -455,8 +455,8 @@ static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {

 static int ParseResiduals(VP8Decoder* const dec,
                          VP8MB* const mb, VP8BitReader* const token_br) {
-  VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
-  const VP8BandProbas* ac_proba;
+  const VP8BandProbas* (* const bands)[16 + 1] = dec->proba_.bands_ptr_;
+  const VP8BandProbas* const * ac_proba;
  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
  const VP8QuantMatrix* const q = &dec->dqm_[block->segment_];
  int16_t* dst = block->coeffs_;
@ -586,7 +586,7 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
    // Parse bitstream for this row.
    VP8BitReader* const token_br =
-        &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
+        &dec->parts_[dec->mb_y_ & dec->num_parts_minus_one_];
    if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                         "Premature end-of-partition0 encountered.");
@ -656,8 +656,7 @@ void VP8Clear(VP8Decoder* const dec) {
    return;
  }
  WebPGetWorkerInterface()->End(&dec->worker_);
-  ALPHDelete(dec->alph_dec_);
-  dec->alph_dec_ = NULL;
+  WebPDeallocateAlphaMemory(dec);
  WebPSafeFree(dec->mem_);
  dec->mem_ = NULL;
  dec->mem_size_ = 0;
@ -666,4 +665,3 @@ void VP8Clear(VP8Decoder* const dec) {
 }

 //------------------------------------------------------------------------------
-
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -15,6 +15,7 @@
 #define WEBP_DEC_VP8I_H_

 #include <string.h>     // for memcpy()
+#include "./common.h"
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
 #include "../utils/random.h"
@ -30,46 +31,10 @@ extern "C" {

 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 4
-#define DEC_REV_VERSION 3
+#define DEC_MIN_VERSION 5
+#define DEC_REV_VERSION 1

-// intra prediction modes
-enum { B_DC_PRED = 0,   // 4x4 modes
-       B_TM_PRED,
-       B_VE_PRED,
-       B_HE_PRED,
-       B_RD_PRED,
-       B_VR_PRED,
-       B_LD_PRED,
-       B_VL_PRED,
-       B_HD_PRED,
-       B_HU_PRED,
-       NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
-
-       // Luma16 or UV modes
-       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
-       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
-       B_PRED = NUM_BMODES,   // refined I4x4 mode
-
-       // special modes
-       B_DC_PRED_NOTOP = 4,
-       B_DC_PRED_NOLEFT = 5,
-       B_DC_PRED_NOTOPLEFT = 6,
-       NUM_B_DC_MODES = 7 };
-
-enum { MB_FEATURE_TREE_PROBS = 3,
-       NUM_MB_SEGMENTS = 4,
-       NUM_REF_LF_DELTAS = 4,
-       NUM_MODE_LF_DELTAS = 4,    // I4x4, ZERO, *, SPLIT
-       MAX_NUM_PARTITIONS = 8,
-       // Probabilities
-       NUM_TYPES = 4,
-       NUM_BANDS = 8,
-       NUM_CTX = 3,
-       NUM_PROBAS = 11,
-       NUM_MV_PROBAS = 19 };
-
-// YUV-cache parameters.
+// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
 // and two 8x8 chroma blocks (u/v). These had better be 16-byte aligned,
 // in order to be SIMD-friendly. We also need to store the top, left and
@ -91,8 +56,6 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 //  'y' = y-samples   'u' = u-samples     'v' = v-samples
 //  '|' = left sample,   '-' = top sample,    '+' = top-left sample
 //  't' = extra top-right sample for 4x4 modes
-// With this layout, BPS (=Bytes Per Scan-line) is one cacheline size.
-#define BPS       32    // this is the common stride used by yuv[]
 #define YUV_SIZE (BPS * 17 + BPS * 9)
 #define Y_SIZE   (BPS * 17)
 #define Y_OFF    (BPS * 1 + 8)
@ -130,7 +93,6 @@ typedef struct {
  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
 } VP8SegmentHeader;

-
 // probas associated to one of the contexts
 typedef uint8_t VP8ProbaArray[NUM_PROBAS];

@ -143,6 +105,7 @@ typedef struct {
  uint8_t segments_[MB_FEATURE_TREE_PROBS];
  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
+  const VP8BandProbas* bands_ptr_[NUM_TYPES][16 + 1];
 } VP8Proba;

 // Filter parameters
@ -246,8 +209,8 @@ struct VP8Decoder {
  int tl_mb_x_, tl_mb_y_;  // top-left MB that must be in-loop filtered
  int br_mb_x_, br_mb_y_;  // last bottom-right MB that must be decoded

-  // number of partitions.
-  int num_parts_;
+  // number of partitions minus one.
+  uint32_t num_parts_minus_one_;
  // per-partition boolean decoders.
  VP8BitReader parts_[MAX_NUM_PARTITIONS];

@ -295,9 +258,11 @@ struct VP8Decoder {
  struct ALPHDecoder* alph_dec_;  // alpha-plane decoder object
  const uint8_t* alpha_data_;     // compressed alpha data (if present)
  size_t alpha_data_size_;
-  int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
-  uint8_t* alpha_plane_;  // output. Persistent, contains the whole data.
-  int alpha_dithering_;   // derived from decoding options (0=off, 100=full).
+  int is_alpha_decoded_;      // true if alpha_data_ is decoded in alpha_plane_
+  uint8_t* alpha_plane_mem_;  // memory allocated for alpha_plane_
+  uint8_t* alpha_plane_;      // output. Persistent, contains the whole data.
+  const uint8_t* alpha_prev_line_;  // last decoded alpha row (or NULL)
+  int alpha_dithering_;       // derived from decoding options (0=off, 100=full)
 };

 //------------------------------------------------------------------------------
@ -317,7 +282,7 @@ int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);
 void VP8ParseQuant(VP8Decoder* const dec);

 // in frame.c
-int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
@ -343,6 +308,7 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);

 // in alpha.c
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      const VP8Io* const io,
                                      int row, int num_rows);

 //------------------------------------------------------------------------------
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
--- a/src/dec/vp8li.h
+++ b/src/dec/vp8li.h
@ -43,6 +43,7 @@ struct VP8LTransform {
 typedef struct {
  int             color_cache_size_;
  VP8LColorCache  color_cache_;
+  VP8LColorCache  saved_color_cache_;  // for incremental

  int             huffman_mask_;
  int             huffman_subsample_bits_;
@ -50,12 +51,12 @@ typedef struct {
  uint32_t       *huffman_image_;
  int             num_htree_groups_;
  HTreeGroup     *htree_groups_;
+  HuffmanCode    *huffman_tables_;
 } VP8LMetadata;

 typedef struct VP8LDecoder VP8LDecoder;
 struct VP8LDecoder {
  VP8StatusCode    status_;
-  VP8LDecodeState  action_;
  VP8LDecodeState  state_;
  VP8Io           *io_;

@ -66,6 +67,9 @@ struct VP8LDecoder {
  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.

  VP8LBitReader    br_;
+  int              incremental_;   // if true, incremental decoding is expected
+  VP8LBitReader    saved_br_;      // note: could be local variables too
+  int              saved_last_pixel_;

  int              width_;
  int              height_;
@ -96,8 +100,7 @@ struct ALPHDecoder;  // Defined in dec/alphai.h.
 // Decodes image header for alpha data stored using lossless compression.
 // Returns false in case of error.
 int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
-                          const uint8_t* const data, size_t data_size,
-                          uint8_t* const output);
+                          const uint8_t* const data, size_t data_size);

 // Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
 // already decoded in previous call(s), it will resume decoding from where it
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@ -16,6 +16,7 @@
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
+#include "../utils/utils.h"
 #include "../webp/mux_types.h"  // ALPHA_FLAG

 //------------------------------------------------------------------------------
@ -43,14 +44,6 @@
 // All sizes are in little-endian order.
 // Note: chunk data size must be padded to multiple of 2 when written.

-static WEBP_INLINE uint32_t get_le24(const uint8_t* const data) {
-  return data[0] | (data[1] << 8) | (data[2] << 16);
-}
-
-static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) {
-  return (uint32_t)get_le24(data) | (data[3] << 24);
-}
-
 // Validates the RIFF container (if detected) and skips over it.
 // If a RIFF container is detected, returns:
 //     VP8_STATUS_BITSTREAM_ERROR for invalid header,
@ -70,7 +63,7 @@ static VP8StatusCode ParseRIFF(const uint8_t** const data,
    if (memcmp(*data + 8, "WEBP", TAG_SIZE)) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong image file signature.
    } else {
-      const uint32_t size = get_le32(*data + TAG_SIZE);
+      const uint32_t size = GetLE32(*data + TAG_SIZE);
      // Check that we have at least one chunk (i.e "WEBP" + "VP8?nnnn").
      if (size < TAG_SIZE + CHUNK_HEADER_SIZE) {
        return VP8_STATUS_BITSTREAM_ERROR;
@ -116,7 +109,7 @@ static VP8StatusCode ParseVP8X(const uint8_t** const data,
  if (!memcmp(*data, "VP8X", TAG_SIZE)) {
    int width, height;
    uint32_t flags;
-    const uint32_t chunk_size = get_le32(*data + TAG_SIZE);
+    const uint32_t chunk_size = GetLE32(*data + TAG_SIZE);
    if (chunk_size != VP8X_CHUNK_SIZE) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong chunk size.
    }
@ -125,9 +118,9 @@ static VP8StatusCode ParseVP8X(const uint8_t** const data,
    if (*data_size < vp8x_size) {
      return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
    }
-    flags = get_le32(*data + 8);
-    width = 1 + get_le24(*data + 12);
-    height = 1 + get_le24(*data + 15);
+    flags = GetLE32(*data + 8);
+    width = 1 + GetLE24(*data + 12);
+    height = 1 + GetLE24(*data + 15);
    if (width * (uint64_t)height >= MAX_IMAGE_AREA) {
      return VP8_STATUS_BITSTREAM_ERROR;  // image is too large
    }
@ -181,7 +174,7 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
      return VP8_STATUS_NOT_ENOUGH_DATA;
    }

-    chunk_size = get_le32(buf + TAG_SIZE);
+    chunk_size = GetLE32(buf + TAG_SIZE);
    if (chunk_size > MAX_CHUNK_PAYLOAD) {
      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
    }
@ -247,7 +240,7 @@ static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,

  if (is_vp8 || is_vp8l) {
    // Bitstream contains VP8/VP8L header.
-    const uint32_t size = get_le32(data + TAG_SIZE);
+    const uint32_t size = GetLE32(data + TAG_SIZE);
    if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Inconsistent size information.
    }
@ -422,7 +415,8 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
 }

 VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
-  VP8StatusCode status;
+  // status is marked volatile as a workaround for a clang-3.8 (aarch64) bug
+  volatile VP8StatusCode status;
  int has_animation = 0;
  assert(headers != NULL);
  // fill out headers, ignore width/height/has_alpha.
@ -519,13 +513,13 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,

  if (status != VP8_STATUS_OK) {
    WebPFreeDecBuffer(params->output);
+  } else {
+    if (params->options != NULL && params->options->flip) {
+      // This restores the original stride values if options->flip was used
+      // during the call to WebPAllocateDecBuffer above.
+      status = WebPFlipBuffer(params->output);
+    }
  }
-
-#if WEBP_DECODER_ABI_VERSION > 0x0203
-  if (params->options != NULL && params->options->flip) {
-    status = WebPFlipBuffer(params->output);
-  }
-#endif
  return status;
 }

@ -767,9 +761,24 @@ VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
  }

  WebPResetDecParams(&params);
-  params.output = &config->output;
  params.options = &config->options;
-  status = DecodeInto(data, data_size, &params);
+  params.output = &config->output;
+  if (WebPAvoidSlowMemory(params.output, &config->input)) {
+    // decoding to slow memory: use a temporary in-mem buffer to decode into.
+    WebPDecBuffer in_mem_buffer;
+    WebPInitDecBuffer(&in_mem_buffer);
+    in_mem_buffer.colorspace = config->output.colorspace;
+    in_mem_buffer.width = config->input.width;
+    in_mem_buffer.height = config->input.height;
+    params.output = &in_mem_buffer;
+    status = DecodeInto(data, data_size, &params);
+    if (status == VP8_STATUS_OK) {  // do the slow-copy
+      status = WebPCopyDecBufferPixels(&in_mem_buffer, &config->output);
+    }
+    WebPFreeDecBuffer(&in_mem_buffer);
+  } else {
+    status = DecodeInto(data, data_size, &params);
+  }

  return status;
 }
@ -808,15 +817,17 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
  // Scaling
  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
  if (io->use_scaling) {
-    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+    int scaled_width = options->scaled_width;
+    int scaled_height = options->scaled_height;
+    if (!WebPRescalerGetScaledDimensions(w, h, &scaled_width, &scaled_height)) {
      return 0;
    }
-    io->scaled_width = options->scaled_width;
-    io->scaled_height = options->scaled_height;
+    io->scaled_width = scaled_width;
+    io->scaled_height = scaled_height;
  }

  // Filter
-  io->bypass_filtering = options && options->bypass_filtering;
+  io->bypass_filtering = (options != NULL) && options->bypass_filtering;

  // Fancy upsampler
 #ifdef FANCY_UPSAMPLING
@ -833,4 +844,3 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
 }

 //------------------------------------------------------------------------------
-
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@ -26,7 +26,10 @@ extern "C" {

 typedef struct WebPDecParams WebPDecParams;
 typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
-typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos);
+typedef int (*OutputAlphaFunc)(const VP8Io* const io, WebPDecParams* const p,
+                               int expected_num_out_lines);
+typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos,
+                             int max_out_lines);

 struct WebPDecParams {
  WebPDecBuffer* output;             // output buffer.
@ -40,13 +43,22 @@ struct WebPDecParams {
  void* memory;                  // overall scratch memory for the output work.

  OutputFunc emit;               // output RGB or YUV samples
-  OutputFunc emit_alpha;         // output alpha channel
+  OutputAlphaFunc emit_alpha;    // output alpha channel
  OutputRowFunc emit_alpha_row;  // output one line of rescaled alpha values
+
+  WebPDecBuffer* final_output;   // In case the user supplied a slow-memory
+                                 // output, we decode image in temporary buffer
+                                 // (this::output) and copy it here.
+  WebPDecBuffer tmp_buffer;      // this::output will point to this one in case
+                                 // of slow memory.
 };

 // Should be called first, before any use of the WebPDecParams object.
 void WebPResetDecParams(WebPDecParams* const params);

+// Delete all memory (after an error occurred, for instance)
+void WebPFreeDecParams(WebPDecParams* const params);
+
 //------------------------------------------------------------------------------
 // Header parsing helpers

@ -104,13 +116,23 @@ VP8StatusCode WebPAllocateDecBuffer(int width, int height,
 VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer);

 // Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
-// memory (still held by 'src').
+// memory (still held by 'src'). No pixels are copied.
 void WebPCopyDecBuffer(const WebPDecBuffer* const src,
                       WebPDecBuffer* const dst);

 // Copy and transfer ownership from src to dst (beware of parameter order!)
 void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);

+// Copy pixels from 'src' into a *preallocated* 'dst' buffer. Returns
+// VP8_STATUS_INVALID_PARAM if the 'dst' is not set up correctly for the copy.
+VP8StatusCode WebPCopyDecBufferPixels(const WebPDecBuffer* const src,
+                                      WebPDecBuffer* const dst);
+
+// Returns true if decoding will be slow with the current configuration
+// and bitstream features.
+int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
+                        const WebPBitstreamFeatures* const features);
+
 //------------------------------------------------------------------------------

 #ifdef __cplusplus
--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -1,7 +1,7 @@
 lib_LTLIBRARIES = libwebpdemux.la

 libwebpdemux_la_SOURCES =
-libwebpdemux_la_SOURCES += demux.c
+libwebpdemux_la_SOURCES += anim_decode.c demux.c

 libwebpdemuxinclude_HEADERS =
 libwebpdemuxinclude_HEADERS += ../webp/demux.h
@ -9,6 +9,6 @@ libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
 libwebpdemuxinclude_HEADERS += ../webp/types.h

 libwebpdemux_la_LIBADD = ../libwebp.la
-libwebpdemux_la_LDFLAGS = -no-undefined -version-info 1:2:0
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:0:0
 libwebpdemuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/anim_decode.c
+++ b/src/demux/anim_decode.c
@ -0,0 +1,442 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  AnimDecoder implementation.
+//
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include <assert.h>
+#include <string.h>
+
+#include "../utils/utils.h"
+#include "../webp/decode.h"
+#include "../webp/demux.h"
+
+#define NUM_CHANNELS 4
+
+typedef void (*BlendRowFunc)(uint32_t* const, const uint32_t* const, int);
+static void BlendPixelRowNonPremult(uint32_t* const src,
+                                    const uint32_t* const dst, int num_pixels);
+static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
+                                 int num_pixels);
+
+// Internal state of an animation decoder (the struct behind the
+// WebPAnimDecoder handle used by the functions in this file).
+struct WebPAnimDecoder {
+  WebPDemuxer* demux_;             // Demuxer created from given WebP bitstream.
+  WebPDecoderConfig config_;       // Decoder config.
+  // Note: we use a pointer to a function blending multiple pixels at a time to
+  // allow possible inlining of per-pixel blending function.
+  BlendRowFunc blend_func_;        // Pointer to the chosen blend row function.
+  WebPAnimInfo info_;              // Global info about the animation.
+  uint8_t* curr_frame_;            // Current canvas (not disposed).
+  uint8_t* prev_frame_disposed_;   // Previous canvas (properly disposed).
+  int prev_frame_timestamp_;       // Previous frame timestamp (milliseconds).
+  WebPIterator prev_iter_;         // Iterator object for previous frame.
+  int prev_frame_was_keyframe_;    // True if previous frame was a keyframe.
+  int next_frame_;                 // Index of the next frame to be decoded
+                                   // (starting from 1).
+};
+
+// Fills 'dec_options' with the default settings: MODE_RGBA output and
+// multi-threading disabled.
+static void DefaultDecoderOptions(WebPAnimDecoderOptions* const dec_options) {
+  dec_options->use_threads = 0;
+  dec_options->color_mode = MODE_RGBA;
+}
+
+// Initializes 'dec_options' with the default values.
+// Returns 0 if 'dec_options' is NULL or if 'abi_version' is incompatible with
+// WEBP_DEMUX_ABI_VERSION, 1 otherwise.
+int WebPAnimDecoderOptionsInitInternal(WebPAnimDecoderOptions* dec_options,
+                                       int abi_version) {
+  if (dec_options == NULL) return 0;
+  if (WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) return 0;
+
+  DefaultDecoderOptions(dec_options);
+  return 1;
+}
+
+// Validates 'dec_options' and configures 'dec' accordingly: selects the blend
+// function matching the color mode and sets up the decoder config.
+// Only MODE_RGBA, MODE_BGRA, MODE_rgbA and MODE_bgrA are accepted.
+// Returns 0 if the color mode is not supported or if the decoder config
+// cannot be initialized, 1 otherwise.
+static int ApplyDecoderOptions(const WebPAnimDecoderOptions* const dec_options,
+                               WebPAnimDecoder* const dec) {
+  WEBP_CSP_MODE mode;
+  WebPDecoderConfig* config = &dec->config_;
+  assert(dec_options != NULL);
+
+  mode = dec_options->color_mode;
+  if (mode != MODE_RGBA && mode != MODE_BGRA &&
+      mode != MODE_rgbA && mode != MODE_bgrA) {
+    return 0;
+  }
+  // RGBA/BGRA use the non-premultiplied blender, rgbA/bgrA the premultiplied
+  // one.
+  dec->blend_func_ = (mode == MODE_RGBA || mode == MODE_BGRA)
+                         ? &BlendPixelRowNonPremult
+                         : &BlendPixelRowPremult;
+  // WebPInitDecoderConfig() signals failure through its return value; do not
+  // carry on with a config that was not initialized.
+  if (!WebPInitDecoderConfig(config)) {
+    return 0;
+  }
+  config->output.colorspace = mode;
+  config->output.is_external_memory = 1;
+  config->options.use_threads = dec_options->use_threads;
+  // Note: config->output.u.RGBA is set at the time of decoding each frame.
+  return 1;
+}
+
+// Creates an animation decoder for 'webp_data', using 'dec_options' or the
+// default options when 'dec_options' is NULL.
+// Returns NULL if 'webp_data' is NULL, 'abi_version' is incompatible, the
+// options are rejected, demuxing fails or memory cannot be allocated.
+WebPAnimDecoder* WebPAnimDecoderNewInternal(
+    const WebPData* webp_data, const WebPAnimDecoderOptions* dec_options,
+    int abi_version) {
+  WebPAnimDecoderOptions options;
+  WebPAnimDecoder* dec = NULL;
+  if (webp_data == NULL ||
+      WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) {
+    return NULL;
+  }
+
+  // Note: calloc() so that the pointer members are initialized to NULL.
+  dec = (WebPAnimDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
+  if (dec == NULL) goto Error;
+
+  if (dec_options != NULL) {
+    options = *dec_options;
+  } else {
+    DefaultDecoderOptions(&options);
+  }
+  if (!ApplyDecoderOptions(&options, dec)) goto Error;
+
+  dec->demux_ = WebPDemux(webp_data);
+  if (dec->demux_ == NULL) goto Error;
+
+  dec->info_.canvas_width = WebPDemuxGetI(dec->demux_, WEBP_FF_CANVAS_WIDTH);
+  dec->info_.canvas_height = WebPDemuxGetI(dec->demux_, WEBP_FF_CANVAS_HEIGHT);
+  dec->info_.loop_count = WebPDemuxGetI(dec->demux_, WEBP_FF_LOOP_COUNT);
+  dec->info_.bgcolor = WebPDemuxGetI(dec->demux_, WEBP_FF_BACKGROUND_COLOR);
+  dec->info_.frame_count = WebPDemuxGetI(dec->demux_, WEBP_FF_FRAME_COUNT);
+
+  {
+    // The canvas size is passed as (bytes per row, number of rows) rather
+    // than as a pre-multiplied 'int': that product can overflow for large
+    // canvases, which would hand a bogus size to the allocator. The size
+    // check is left to WebPSafeCalloc().
+    const uint64_t canvas_row_bytes =
+        (uint64_t)dec->info_.canvas_width * NUM_CHANNELS;
+    const size_t canvas_rows = (size_t)dec->info_.canvas_height;
+    // Note: calloc() because we fill frame with zeroes as well.
+    dec->curr_frame_ =
+        (uint8_t*)WebPSafeCalloc(canvas_row_bytes, canvas_rows);
+    if (dec->curr_frame_ == NULL) goto Error;
+    dec->prev_frame_disposed_ =
+        (uint8_t*)WebPSafeCalloc(canvas_row_bytes, canvas_rows);
+    if (dec->prev_frame_disposed_ == NULL) goto Error;
+  }
+
+  WebPAnimDecoderReset(dec);
+
+  return dec;
+
+ Error:
+  // WebPAnimDecoderDelete() accepts NULL and partially built decoders.
+  WebPAnimDecoderDelete(dec);
+  return NULL;
+}
+
+// Copies the global animation information of 'dec' into '*info'.
+// Returns 0 if either argument is NULL, 1 on success.
+int WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec, WebPAnimInfo* info) {
+  if (dec != NULL && info != NULL) {
+    *info = dec->info_;
+    return 1;
+  }
+  return 0;
+}
+
+// Returns 1 if a 'width' x 'height' frame has exactly the canvas dimensions,
+// 0 otherwise.
+static int IsFullFrame(int width, int height, int canvas_width,
+                       int canvas_height) {
+  if (width != canvas_width) return 0;
+  return (height == canvas_height);
+}
+
+// Clear the canvas to transparent.
+// 'buf' must hold at least canvas_width * NUM_CHANNELS * canvas_height bytes.
+// The byte count is computed in size_t: the same product evaluated in 32-bit
+// arithmetic can wrap around for large canvases, which would leave part of
+// the canvas uncleared.
+static void ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
+                           uint32_t canvas_height) {
+  const size_t num_bytes =
+      (size_t)canvas_width * NUM_CHANNELS * canvas_height;
+  memset(buf, 0, num_bytes);
+}
+
+// Clear given frame rectangle to transparent.
+// 'buf' points to the top-left pixel of the canvas and 'buf_stride' is the
+// distance in bytes between two consecutive rows. 'x_offset', 'y_offset',
+// 'width' and 'height' are in pixels.
+static void ZeroFillFrameRect(uint8_t* buf, int buf_stride, int x_offset,
+                              int y_offset, int width, int height) {
+  const size_t row_bytes = (size_t)width * NUM_CHANNELS;
+  int j;
+  assert(width * NUM_CHANNELS <= buf_stride);
+  // The byte offset is computed in size_t: 'y_offset * buf_stride' evaluated
+  // as 'int' can overflow for large canvases.
+  buf += (size_t)y_offset * buf_stride + (size_t)x_offset * NUM_CHANNELS;
+  for (j = 0; j < height; ++j) {
+    memset(buf, 0, row_bytes);
+    buf += buf_stride;
+  }
+}
+
+// Copy width * height pixels from 'src' to 'dst'.
+// Both buffers must be non-NULL and hold at least
+// width * NUM_CHANNELS * height bytes.
+// The byte count is computed in size_t: the same product evaluated in 32-bit
+// arithmetic can wrap around for large canvases, which would copy only part
+// of the canvas.
+static void CopyCanvas(const uint8_t* src, uint8_t* dst,
+                       uint32_t width, uint32_t height) {
+  const size_t num_bytes = (size_t)width * NUM_CHANNELS * height;
+  assert(src != NULL && dst != NULL);
+  memcpy(dst, src, num_bytes);
+}
+
+// Returns true if the current frame is a key-frame, that is if one of the
+// following holds:
+// * it is the first frame,
+// * it covers the full canvas and either has no alpha or is not blended,
+// * the previous frame is disposed to background and either covered the full
+//   canvas or was a key-frame itself.
+static int IsKeyFrame(const WebPIterator* const curr,
+                      const WebPIterator* const prev,
+                      int prev_frame_was_key_frame,
+                      int canvas_width, int canvas_height) {
+  int overwrites_pixels;
+  if (curr->frame_num == 1) return 1;
+
+  overwrites_pixels =
+      !curr->has_alpha || (curr->blend_method == WEBP_MUX_NO_BLEND);
+  if (overwrites_pixels &&
+      IsFullFrame(curr->width, curr->height, canvas_width, canvas_height)) {
+    return 1;
+  }
+
+  if (prev->dispose_method != WEBP_MUX_DISPOSE_BACKGROUND) return 0;
+  return IsFullFrame(prev->width, prev->height, canvas_width, canvas_height) ||
+         prev_frame_was_key_frame;
+}
+
+
+// Blends the 8-bit channel located at bit position 'shift' of 'src' over the
+// same channel of 'dst'. Neither pixel is pre-multiplied by alpha.
+// 'src_a' and 'dst_a' are the weights applied to the two channel values, and
+// 'scale' is the factor the weighted sum is multiplied by before the final
+// shift right by 24 bits.
+static uint8_t BlendChannelNonPremult(uint32_t src, uint8_t src_a,
+                                      uint32_t dst, uint8_t dst_a,
+                                      uint32_t scale, int shift) {
+  const uint8_t src_value = (src >> shift) & 0xff;
+  const uint8_t dst_value = (dst >> shift) & 0xff;
+  const uint32_t weighted_sum = src_value * src_a + dst_value * dst_a;
+  // The multiplication below must not overflow 32 bits.
+  assert(weighted_sum < (1ULL << 32) / scale);
+  return (weighted_sum * scale) >> 24;
+}
+
+// Blend 'src' over 'dst' assuming they are NOT pre-multiplied by alpha.
+// Alpha is read from bits 24..31 of each pixel and the three color channels
+// from bits 0..7, 8..15 and 16..23. Returns the blended pixel in that layout.
+static uint32_t BlendPixelNonPremult(uint32_t src, uint32_t dst) {
+  const uint8_t src_a = (src >> 24) & 0xff;
+
+  if (src_a == 0) {
+    // Fully transparent source: the result is 'dst' unchanged.
+    return dst;
+  } else {
+    const uint8_t dst_a = (dst >> 24) & 0xff;
+    // This is the approximate integer arithmetic for the actual formula:
+    // dst_factor_a = (dst_a * (255 - src_a)) / 255.
+    const uint8_t dst_factor_a = (dst_a * (256 - src_a)) >> 8;
+    const uint8_t blend_a = src_a + dst_factor_a;
+    // Reciprocal of the blended alpha, scaled by 2^24. The channel helper
+    // multiplies by it and shifts right by 24 to normalize the weighted sum.
+    const uint32_t scale = (1UL << 24) / blend_a;
+
+    const uint8_t blend_r =
+        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 0);
+    const uint8_t blend_g =
+        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 8);
+    const uint8_t blend_b =
+        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 16);
+    // Sanity check: the blended alpha computed above must fit in 8 bits.
+    assert(src_a + dst_factor_a < 256);
+
+    // Re-assemble the pixel with the channels at their original positions.
+    return (blend_r << 0) |
+           (blend_g << 8) |
+           (blend_b << 16) |
+           ((uint32_t)blend_a << 24);
+  }
+}
+
+// Blends a row of 'num_pixels' pixels of 'src' over 'dst', in place in 'src'.
+// Pixels are assumed NOT to be pre-multiplied by alpha.
+static void BlendPixelRowNonPremult(uint32_t* const src,
+                                    const uint32_t* const dst, int num_pixels) {
+  int x;
+  for (x = 0; x < num_pixels; ++x) {
+    // Fully opaque source pixels are left untouched.
+    if ((src[x] >> 24) == 0xff) continue;
+    src[x] = BlendPixelNonPremult(src[x], dst[x]);
+  }
+}
+
+// Multiplies each of the four 8-bit channels of 'pix' by 'scale' and keeps
+// bits 8..15 of each product, handling the channels two at a time.
+// NOTE(review): presumably 'scale' is expected to be at most 256 so that the
+// two products sharing a word do not overlap -- confirm against callers.
+static WEBP_INLINE uint32_t ChannelwiseMultiply(uint32_t pix, uint32_t scale) {
+  const uint32_t low_bytes = 0x00FF00FF;
+  // Channels at bits 0..7 and 16..23.
+  const uint32_t even = (((pix & low_bytes) * scale) >> 8) & low_bytes;
+  // Channels at bits 8..15 and 24..31.
+  const uint32_t odd = (((pix >> 8) & low_bytes) * scale) & ~low_bytes;
+  return even | odd;
+}
+
+// Blends 'src' over 'dst', both being pre-multiplied by alpha: 'dst' is
+// scaled channel-wise by (256 - source alpha) and added to 'src'.
+static uint32_t BlendPixelPremult(uint32_t src, uint32_t dst) {
+  const uint32_t dst_scale = 256 - (src >> 24);
+  return src + ChannelwiseMultiply(dst, dst_scale);
+}
+
+// Blends a row of 'num_pixels' pixels of 'src' over 'dst', in place in 'src'.
+// Pixels are assumed to be pre-multiplied by alpha.
+static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
+                                 int num_pixels) {
+  int x;
+  for (x = 0; x < num_pixels; ++x) {
+    // Fully opaque source pixels are left untouched.
+    if ((src[x] >> 24) == 0xff) continue;
+    src[x] = BlendPixelPremult(src[x], dst[x]);
+  }
+}
+
+// Computes, for canvas row 'canvas_y', the (up to two) horizontal ranges that
+// belong to the 'src' rectangle but not to the 'dst' rectangle. Each range is
+// returned as a <left, width> pair; a range is empty if its width is 0, in
+// which case its 'left' value is -1.
+// 'canvas_y' must be a row covered by 'src'.
+static void FindBlendRangeAtRow(const WebPIterator* const src,
+                                const WebPIterator* const dst, int canvas_y,
+                                int* const left1, int* const width1,
+                                int* const left2, int* const width2) {
+  const int src_left = src->x_offset;
+  const int src_right = src_left + src->width;
+  const int dst_left = dst->x_offset;
+  const int dst_right = dst_left + dst->width;
+  const int dst_bottom = dst->y_offset + dst->height;
+  const int row_outside_dst =
+      (canvas_y < dst->y_offset || canvas_y >= dst_bottom);
+  const int no_x_overlap = (src_left >= dst_right || src_right <= dst_left);
+  assert(canvas_y >= src->y_offset && canvas_y < (src->y_offset + src->height));
+
+  // Start with two empty ranges.
+  *left1 = -1;
+  *width1 = 0;
+  *left2 = -1;
+  *width2 = 0;
+
+  if (row_outside_dst || no_x_overlap) {
+    // 'dst' does not touch this row of 'src': the whole row is one range.
+    *left1 = src_left;
+    *width1 = src->width;
+  } else {
+    if (src_left < dst_left) {  // Part of 'src' to the left of 'dst'.
+      *left1 = src_left;
+      *width1 = dst_left - src_left;
+    }
+    if (src_right > dst_right) {  // Part of 'src' to the right of 'dst'.
+      *left2 = dst_right;
+      *width2 = src_right - dst_right;
+    }
+  }
+}
+
+// Decodes the next frame of the animation and reconstructs the full canvas.
+// On success, '*buf_ptr' is set to the reconstructed canvas (owned by 'dec')
+// and '*timestamp_ptr' to the sum of the previous frame timestamp and the
+// duration of this frame; returns 1.
+// Returns 0 if an argument is NULL, if there are no more frames, or if the
+// frame cannot be fetched or decoded.
+int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
+                           uint8_t** buf_ptr, int* timestamp_ptr) {
+  WebPIterator iter;
+  uint32_t width;
+  uint32_t height;
+  int is_key_frame;
+  int timestamp;
+  BlendRowFunc blend_row;
+
+  if (dec == NULL || buf_ptr == NULL || timestamp_ptr == NULL) return 0;
+  if (!WebPAnimDecoderHasMoreFrames(dec)) return 0;
+
+  width = dec->info_.canvas_width;
+  height = dec->info_.canvas_height;
+  blend_row = dec->blend_func_;
+
+  // Get compressed frame.
+  if (!WebPDemuxGetFrame(dec->demux_, dec->next_frame_, &iter)) {
+    return 0;
+  }
+  timestamp = dec->prev_frame_timestamp_ + iter.duration;
+
+  // Initialize.
+  is_key_frame = IsKeyFrame(&iter, &dec->prev_iter_,
+                            dec->prev_frame_was_keyframe_, width, height);
+  if (is_key_frame) {
+    ZeroFillCanvas(dec->curr_frame_, width, height);
+  } else {
+    CopyCanvas(dec->prev_frame_disposed_, dec->curr_frame_, width, height);
+  }
+
+  // Decode.
+  {
+    const uint8_t* in = iter.fragment.bytes;
+    const size_t in_size = iter.fragment.size;
+    // Byte strides, offsets and sizes are computed in size_t: the same
+    // products evaluated in 32-bit arithmetic can wrap around for large
+    // canvases, which would make the decoder write at the wrong location.
+    const size_t stride = (size_t)width * NUM_CHANNELS;
+    const size_t out_offset =
+        (size_t)iter.y_offset * stride + (size_t)iter.x_offset * NUM_CHANNELS;
+    WebPDecoderConfig* const config = &dec->config_;
+    WebPRGBABuffer* const buf = &config->output.u.RGBA;
+    buf->stride = (int)stride;
+    buf->size = stride * (size_t)iter.height;
+    buf->rgba = dec->curr_frame_ + out_offset;
+
+    if (WebPDecode(in, in_size, config) != VP8_STATUS_OK) {
+      goto Error;
+    }
+  }
+
+  // During the decoding of current frame, we may have set some pixels to be
+  // transparent (i.e. alpha < 255). However, the value of each of these
+  // pixels should have been determined by blending it against the value of
+  // that pixel in the previous frame if the blending method is
+  // WEBP_MUX_BLEND.
+  if (iter.frame_num > 1 && iter.blend_method == WEBP_MUX_BLEND &&
+      !is_key_frame) {
+    if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_NONE) {
+      int y;
+      // Blend transparent pixels with pixels in previous canvas.
+      for (y = 0; y < iter.height; ++y) {
+        const size_t offset =
+            (iter.y_offset + y) * width + iter.x_offset;
+        blend_row((uint32_t*)dec->curr_frame_ + offset,
+                  (uint32_t*)dec->prev_frame_disposed_ + offset, iter.width);
+      }
+    } else {
+      int y;
+      assert(dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND);
+      // We need to blend a transparent pixel with its value just after
+      // initialization. That is, blend it with:
+      // * Fully transparent pixel if it belongs to prevRect <-- No-op.
+      // * The pixel in the previous canvas otherwise <-- Need alpha-blending.
+      for (y = 0; y < iter.height; ++y) {
+        const int canvas_y = iter.y_offset + y;
+        int left1, width1, left2, width2;
+        FindBlendRangeAtRow(&iter, &dec->prev_iter_, canvas_y, &left1, &width1,
+                            &left2, &width2);
+        if (width1 > 0) {
+          const size_t offset1 = canvas_y * width + left1;
+          blend_row((uint32_t*)dec->curr_frame_ + offset1,
+                    (uint32_t*)dec->prev_frame_disposed_ + offset1, width1);
+        }
+        if (width2 > 0) {
+          const size_t offset2 = canvas_y * width + left2;
+          blend_row((uint32_t*)dec->curr_frame_ + offset2,
+                    (uint32_t*)dec->prev_frame_disposed_ + offset2, width2);
+        }
+      }
+    }
+  }
+
+  // Update info of the previous frame and dispose it for the next iteration.
+  dec->prev_frame_timestamp_ = timestamp;
+  dec->prev_iter_ = iter;
+  dec->prev_frame_was_keyframe_ = is_key_frame;
+  CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height);
+  if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+    ZeroFillFrameRect(dec->prev_frame_disposed_, width * NUM_CHANNELS,
+                      dec->prev_iter_.x_offset, dec->prev_iter_.y_offset,
+                      dec->prev_iter_.width, dec->prev_iter_.height);
+  }
+  ++dec->next_frame_;
+
+  // All OK, fill in the values.
+  *buf_ptr = dec->curr_frame_;
+  *timestamp_ptr = timestamp;
+  return 1;
+
+ Error:
+  WebPDemuxReleaseIterator(&iter);
+  return 0;
+}
+
+// Returns 1 if the index of the next frame to decode does not exceed the
+// frame count, 0 otherwise (including when 'dec' is NULL).
+int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec) {
+  return (dec != NULL) && (dec->next_frame_ <= (int)dec->info_.frame_count);
+}
+
+// Rewinds 'dec' so that the next decoded frame is the first one again, and
+// clears what is remembered about the previous frame. No-op if 'dec' is NULL.
+void WebPAnimDecoderReset(WebPAnimDecoder* dec) {
+  if (dec == NULL) return;
+
+  dec->next_frame_ = 1;
+  dec->prev_frame_was_keyframe_ = 0;
+  memset(&dec->prev_iter_, 0, sizeof(dec->prev_iter_));
+  dec->prev_frame_timestamp_ = 0;
+}
+
+// Returns the demuxer held by 'dec', or NULL if 'dec' is NULL.
+const WebPDemuxer* WebPAnimDecoderGetDemuxer(const WebPAnimDecoder* dec) {
+  return (dec != NULL) ? dec->demux_ : NULL;
+}
+
+// Releases the demuxer, both canvases and 'dec' itself.
+// No-op if 'dec' is NULL.
+void WebPAnimDecoderDelete(WebPAnimDecoder* dec) {
+  if (dec == NULL) return;
+
+  WebPDemuxDelete(dec->demux_);
+  WebPSafeFree(dec->curr_frame_);
+  WebPSafeFree(dec->prev_frame_disposed_);
+  WebPSafeFree(dec);
+}
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@ -24,8 +24,8 @@
 #include "../webp/format_constants.h"

 #define DMUX_MAJ_VERSION 0
-#define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 2
+#define DMUX_MIN_VERSION 3
+#define DMUX_REV_VERSION 0

 typedef struct {
  size_t start_;        // start location of the data
@ -47,8 +47,7 @@ typedef struct Frame {
  int duration_;
  WebPMuxAnimDispose dispose_method_;
  WebPMuxAnimBlend blend_method_;
-  int is_fragment_;  // this is a frame fragment (and not a full frame).
-  int frame_num_;  // the referent frame number for use in assembling fragments.
+  int frame_num_;
  int complete_;   // img_components_ contains a full image.
  ChunkData img_components_[2];  // 0=VP8{,L} 1=ALPH
  struct Frame* next_;
@ -193,6 +192,19 @@ static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
  return 1;
 }

+// Records in 'frame' the location of its image data ('start_offset', 'size'),
+// the dimensions taken from 'features', the frame number and the completion
+// state. The alpha flag of 'features' is OR-ed into the existing
+// frame->has_alpha_ rather than overwriting it.
+static void SetFrameInfo(size_t start_offset, size_t size,
+                         int frame_num, int complete,
+                         const WebPBitstreamFeatures* const features,
+                         Frame* const frame) {
+  ChunkData* const image = &frame->img_components_[0];
+  image->offset_ = start_offset;
+  image->size_ = size;
+
+  frame->frame_num_ = frame_num;
+  frame->complete_ = complete;
+
+  frame->width_ = features->width;
+  frame->height_ = features->height;
+  frame->has_alpha_ |= features->has_alpha;
+}
+
 // Store image bearing chunks to 'frame'.
 static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
                              MemBuffer* const mem, Frame* const frame) {
@ -248,13 +260,8 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
            return PARSE_ERROR;
          }
          ++image_chunks;
-          frame->img_components_[0].offset_ = chunk_start_offset;
-          frame->img_components_[0].size_ = chunk_size;
-          frame->width_ = features.width;
-          frame->height_ = features.height;
-          frame->has_alpha_ |= features.has_alpha;
-          frame->frame_num_ = frame_num;
-          frame->complete_ = (status == PARSE_OK);
+          SetFrameInfo(chunk_start_offset, chunk_size, frame_num,
+                       status == PARSE_OK, &features, frame);
          Skip(mem, payload_available);
        } else {
          goto Done;
@ -337,42 +344,6 @@ static ParseStatus ParseAnimationFrame(
  return status;
 }

-#ifdef WEBP_EXPERIMENTAL_FEATURES
-// Parse a 'FRGM' chunk and any image bearing chunks that immediately follow.
-// 'fragment_chunk_size' is the previously validated, padded chunk size.
-static ParseStatus ParseFragment(WebPDemuxer* const dmux,
-                                 uint32_t fragment_chunk_size) {
-  const int frame_num = 1;  // All fragments belong to the 1st (and only) frame.
-  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
-  const uint32_t frgm_payload_size = fragment_chunk_size - FRGM_CHUNK_SIZE;
-  int added_fragment = 0;
-  MemBuffer* const mem = &dmux->mem_;
-  Frame* frame;
-  ParseStatus status =
-      NewFrame(mem, FRGM_CHUNK_SIZE, fragment_chunk_size, &frame);
-  if (status != PARSE_OK) return status;
-
-  frame->is_fragment_ = 1;
-  frame->x_offset_ = 2 * ReadLE24s(mem);
-  frame->y_offset_ = 2 * ReadLE24s(mem);
-
-  // Store a fragment only if the 'fragments' flag is set and there is some
-  // data available.
-  status = StoreFrame(frame_num, frgm_payload_size, mem, frame);
-  if (status != PARSE_ERROR && is_fragmented && frame->frame_num_ > 0) {
-    added_fragment = AddFrame(dmux, frame);
-    if (!added_fragment) {
-      status = PARSE_ERROR;
-    } else {
-      dmux->num_frames_ = 1;
-    }
-  }
-
-  if (!added_fragment) WebPSafeFree(frame);
-  return status;
-}
-#endif  // WEBP_EXPERIMENTAL_FEATURES
-
 // General chunk storage, starting with the header at 'start_offset', allowing
 // the user to request the payload via a fourcc string. 'size' includes the
 // header and the unpadded payload size.
@ -513,12 +484,6 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
        status = ParseAnimationFrame(dmux, chunk_size_padded);
        break;
      }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      case MKFOURCC('F', 'R', 'G', 'M'): {
-        status = ParseFragment(dmux, chunk_size_padded);
-        break;
-      }
-#endif
      case MKFOURCC('I', 'C', 'C', 'P'): {
        store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
        goto Skip;
@ -606,8 +571,6 @@ static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {

 // If 'exact' is true, check that the image resolution matches the canvas.
 // If 'exact' is false, check that the x/y offsets do not exceed the canvas.
-// TODO(jzern): this is insufficient in the fragmented image case if the
-// expectation is that the fragments completely cover the canvas.
 static int CheckFrameBounds(const Frame* const frame, int exact,
                            int canvas_width, int canvas_height) {
  if (exact) {
@ -635,22 +598,17 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
  if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
  if (dmux->loop_count_ < 0) return 0;
  if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
-#ifndef WEBP_EXPERIMENTAL_FEATURES
  if (is_fragmented) return 0;
-#endif

  while (f != NULL) {
    const int cur_frame_set = f->frame_num_;
-    int frame_count = 0, fragment_count = 0;
+    int frame_count = 0;

-    // Check frame properties and if the image is composed of fragments that
-    // each fragment came from a fragment.
+    // Check frame properties.
    for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
      const ChunkData* const image = f->img_components_;
      const ChunkData* const alpha = f->img_components_ + 1;

-      if (is_fragmented && !f->is_fragment_) return 0;
-      if (!is_fragmented && f->is_fragment_) return 0;
      if (!is_animation && f->frame_num_ > 1) return 0;

      if (f->complete_) {
@ -675,16 +633,13 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
      }

      if (f->width_ > 0 && f->height_ > 0 &&
-          !CheckFrameBounds(f, !(is_animation || is_fragmented),
+          !CheckFrameBounds(f, !is_animation,
                            dmux->canvas_width_, dmux->canvas_height_)) {
        return 0;
      }

-      fragment_count += f->is_fragment_;
      ++frame_count;
    }
-    if (!is_fragmented && frame_count > 1) return 0;
-    if (fragment_count > 0 && frame_count != fragment_count) return 0;
  }
  return 1;
 }
@ -703,6 +658,41 @@ static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
  dmux->mem_ = *mem;
 }

+static ParseStatus CreateRawImageDemuxer(MemBuffer* const mem,
+                                         WebPDemuxer** demuxer) {
+  WebPBitstreamFeatures features;
+  const VP8StatusCode status =
+      WebPGetFeatures(mem->buf_, mem->buf_size_, &features);
+  *demuxer = NULL;
+  if (status != VP8_STATUS_OK) {
+    return (status == VP8_STATUS_NOT_ENOUGH_DATA) ? PARSE_NEED_MORE_DATA
+                                                  : PARSE_ERROR;
+  }
+
+  {
+    WebPDemuxer* const dmux = (WebPDemuxer*)WebPSafeCalloc(1ULL, sizeof(*dmux));
+    Frame* const frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
+    if (dmux == NULL || frame == NULL) goto Error;
+    InitDemux(dmux, mem);
+    SetFrameInfo(0, mem->buf_size_, 1 /*frame_num*/, 1 /*complete*/, &features,
+                 frame);
+    if (!AddFrame(dmux, frame)) goto Error;
+    dmux->state_ = WEBP_DEMUX_DONE;
+    dmux->canvas_width_ = frame->width_;
+    dmux->canvas_height_ = frame->height_;
+    dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
+    dmux->num_frames_ = 1;
+    assert(IsValidSimpleFormat(dmux));
+    *demuxer = dmux;
+    return PARSE_OK;
+
+ Error:
+    WebPSafeFree(dmux);
+    WebPSafeFree(frame);
+    return PARSE_ERROR;
+  }
+}
+
 WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
                               WebPDemuxState* state, int version) {
  const ChunkParser* parser;
@ -719,6 +709,15 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
  if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
  status = ReadHeader(&mem);
  if (status != PARSE_OK) {
+    // If parsing of the webp file header fails, attempt to handle a raw
+    // VP8/VP8L frame. Note 'allow_partial' is ignored in this case.
+    if (status == PARSE_ERROR) {
+      status = CreateRawImageDemuxer(&mem, &dmux);
+      if (status == PARSE_OK) {
+        if (state != NULL) *state = WEBP_DEMUX_DONE;
+        return dmux;
+      }
+    }
    if (state != NULL) {
      *state = (status == PARSE_NEED_MORE_DATA) ? WEBP_DEMUX_PARSING_HEADER
                                                : WEBP_DEMUX_PARSE_ERROR;
@ -790,8 +789,6 @@ uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
 // -----------------------------------------------------------------------------
 // Frame iteration

-// Find the first 'frame_num' frame. There may be multiple such frames in a
-// fragmented frame.
 static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
  const Frame* f;
  for (f = dmux->frames_; f != NULL; f = f->next_) {
@ -800,21 +797,6 @@ static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
  return f;
 }

-// Returns fragment 'fragment_num' and the total count.
-static const Frame* GetFragment(
-    const Frame* const frame_set, int fragment_num, int* const count) {
-  const int this_frame = frame_set->frame_num_;
-  const Frame* f = frame_set;
-  const Frame* fragment = NULL;
-  int total;
-
-  for (total = 0; f != NULL && f->frame_num_ == this_frame; f = f->next_) {
-    if (++total == fragment_num) fragment = f;
-  }
-  *count = total;
-  return fragment;
-}
-
 static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
                                      const Frame* const frame,
                                      size_t* const data_size) {
@ -841,34 +823,27 @@ static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,

 // Create a whole 'frame' from VP8 (+ alpha) or lossless.
 static int SynthesizeFrame(const WebPDemuxer* const dmux,
-                           const Frame* const first_frame,
-                           int fragment_num, WebPIterator* const iter) {
+                           const Frame* const frame,
+                           WebPIterator* const iter) {
  const uint8_t* const mem_buf = dmux->mem_.buf_;
-  int num_fragments;
  size_t payload_size = 0;
-  const Frame* const fragment =
-      GetFragment(first_frame, fragment_num, &num_fragments);
-  const uint8_t* const payload =
-      GetFramePayload(mem_buf, fragment, &payload_size);
+  const uint8_t* const payload = GetFramePayload(mem_buf, frame, &payload_size);
  if (payload == NULL) return 0;
-  assert(first_frame != NULL);
+  assert(frame != NULL);

-  iter->frame_num      = first_frame->frame_num_;
+  iter->frame_num      = frame->frame_num_;
  iter->num_frames     = dmux->num_frames_;
-  iter->fragment_num   = fragment_num;
-  iter->num_fragments  = num_fragments;
-  iter->x_offset       = fragment->x_offset_;
-  iter->y_offset       = fragment->y_offset_;
-  iter->width          = fragment->width_;
-  iter->height         = fragment->height_;
-  iter->has_alpha      = fragment->has_alpha_;
-  iter->duration       = fragment->duration_;
-  iter->dispose_method = fragment->dispose_method_;
-  iter->blend_method   = fragment->blend_method_;
-  iter->complete       = fragment->complete_;
+  iter->x_offset       = frame->x_offset_;
+  iter->y_offset       = frame->y_offset_;
+  iter->width          = frame->width_;
+  iter->height         = frame->height_;
+  iter->has_alpha      = frame->has_alpha_;
+  iter->duration       = frame->duration_;
+  iter->dispose_method = frame->dispose_method_;
+  iter->blend_method   = frame->blend_method_;
+  iter->complete       = frame->complete_;
  iter->fragment.bytes = payload;
  iter->fragment.size  = payload_size;
-  // TODO(jzern): adjust offsets for 'FRGM's embedded in 'ANMF's
  return 1;
 }

@ -882,7 +857,7 @@ static int SetFrame(int frame_num, WebPIterator* const iter) {
  frame = GetFrame(dmux, frame_num);
  if (frame == NULL) return 0;

-  return SynthesizeFrame(dmux, frame, 1, iter);
+  return SynthesizeFrame(dmux, frame, iter);
 }

 int WebPDemuxGetFrame(const WebPDemuxer* dmux, int frame, WebPIterator* iter) {
@ -904,17 +879,6 @@ int WebPDemuxPrevFrame(WebPIterator* iter) {
  return SetFrame(iter->frame_num - 1, iter);
 }

-int WebPDemuxSelectFragment(WebPIterator* iter, int fragment_num) {
-  if (iter != NULL && iter->private_ != NULL && fragment_num > 0) {
-    const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
-    const Frame* const frame = GetFrame(dmux, iter->frame_num);
-    if (frame == NULL) return 0;
-
-    return SynthesizeFrame(dmux, frame, fragment_num, iter);
-  }
-  return 0;
-}
-
 void WebPDemuxReleaseIterator(WebPIterator* iter) {
  (void)iter;
 }
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -1,5 +1,8 @@
 noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
 noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
+noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la
+noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la
+noinst_LTLIBRARIES += libwebpdspdecode_msa.la

 if BUILD_LIBWEBPDECODER
  noinst_LTLIBRARIES += libwebpdspdecode.la
@ -10,63 +13,127 @@ commondir = $(includedir)/webp

 COMMON_SOURCES =
 COMMON_SOURCES += alpha_processing.c
+COMMON_SOURCES += alpha_processing_mips_dsp_r2.c
+COMMON_SOURCES += common_sse2.h
 COMMON_SOURCES += cpu.c
 COMMON_SOURCES += dec.c
 COMMON_SOURCES += dec_clip_tables.c
 COMMON_SOURCES += dec_mips32.c
-COMMON_SOURCES += dec_neon.c
+COMMON_SOURCES += dec_mips_dsp_r2.c
 COMMON_SOURCES += dsp.h
+COMMON_SOURCES += filters.c
+COMMON_SOURCES += filters_mips_dsp_r2.c
 COMMON_SOURCES += lossless.c
 COMMON_SOURCES += lossless.h
-COMMON_SOURCES += lossless_mips32.c
-COMMON_SOURCES += lossless_neon.c
-COMMON_SOURCES += neon.h
+COMMON_SOURCES += lossless_mips_dsp_r2.c
+COMMON_SOURCES += mips_macro.h
+COMMON_SOURCES += rescaler.c
+COMMON_SOURCES += rescaler_mips32.c
+COMMON_SOURCES += rescaler_mips_dsp_r2.c
 COMMON_SOURCES += upsampling.c
-COMMON_SOURCES += upsampling_neon.c
+COMMON_SOURCES += upsampling_mips_dsp_r2.c
 COMMON_SOURCES += yuv.c
 COMMON_SOURCES += yuv.h
 COMMON_SOURCES += yuv_mips32.c
+COMMON_SOURCES += yuv_mips_dsp_r2.c

 ENC_SOURCES =
+ENC_SOURCES += argb.c
+ENC_SOURCES += argb_mips_dsp_r2.c
+ENC_SOURCES += cost.c
+ENC_SOURCES += cost_mips32.c
+ENC_SOURCES += cost_mips_dsp_r2.c
 ENC_SOURCES += enc.c
 ENC_SOURCES += enc_mips32.c
-ENC_SOURCES += enc_neon.c
+ENC_SOURCES += enc_mips_dsp_r2.c
+ENC_SOURCES += lossless_enc.c
+ENC_SOURCES += lossless_enc_mips32.c
+ENC_SOURCES += lossless_enc_mips_dsp_r2.c

 libwebpdsp_avx2_la_SOURCES =
 libwebpdsp_avx2_la_SOURCES += enc_avx2.c
 libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)

+libwebpdspdecode_sse41_la_SOURCES =
+libwebpdspdecode_sse41_la_SOURCES += alpha_processing_sse41.c
+libwebpdspdecode_sse41_la_SOURCES += dec_sse41.c
+libwebpdspdecode_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdspdecode_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)
+
 libwebpdspdecode_sse2_la_SOURCES =
 libwebpdspdecode_sse2_la_SOURCES += alpha_processing_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += dec_sse2.c
+libwebpdspdecode_sse2_la_SOURCES += filters_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += lossless_sse2.c
+libwebpdspdecode_sse2_la_SOURCES += rescaler_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += upsampling_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += yuv_sse2.c
-libwebpdspdecode_sse2_la_SOURCES += yuv_tables_sse2.h
 libwebpdspdecode_sse2_la_CPPFLAGS = $(libwebpdsp_sse2_la_CPPFLAGS)
 libwebpdspdecode_sse2_la_CFLAGS = $(libwebpdsp_sse2_la_CFLAGS)

+libwebpdspdecode_neon_la_SOURCES =
+libwebpdspdecode_neon_la_SOURCES += dec_neon.c
+libwebpdspdecode_neon_la_SOURCES += lossless_neon.c
+libwebpdspdecode_neon_la_SOURCES += neon.h
+libwebpdspdecode_neon_la_SOURCES += rescaler_neon.c
+libwebpdspdecode_neon_la_SOURCES += upsampling_neon.c
+libwebpdspdecode_neon_la_CPPFLAGS = $(libwebpdsp_neon_la_CPPFLAGS)
+libwebpdspdecode_neon_la_CFLAGS = $(libwebpdsp_neon_la_CFLAGS)
+
+libwebpdspdecode_msa_la_SOURCES =
+libwebpdspdecode_msa_la_SOURCES += dec_msa.c
+libwebpdspdecode_msa_la_SOURCES += msa_macro.h
+libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdspdecode_msa_la_CFLAGS = $(AM_CFLAGS)
+
 libwebpdsp_sse2_la_SOURCES =
+libwebpdsp_sse2_la_SOURCES += argb_sse2.c
+libwebpdsp_sse2_la_SOURCES += cost_sse2.c
 libwebpdsp_sse2_la_SOURCES += enc_sse2.c
+libwebpdsp_sse2_la_SOURCES += lossless_enc_sse2.c
 libwebpdsp_sse2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
 libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la

+libwebpdsp_sse41_la_SOURCES =
+libwebpdsp_sse41_la_SOURCES += enc_sse41.c
+libwebpdsp_sse41_la_SOURCES += lossless_enc_sse41.c
+libwebpdsp_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdsp_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)
+libwebpdsp_sse41_la_LIBADD = libwebpdspdecode_sse41.la
+
+libwebpdsp_neon_la_SOURCES =
+libwebpdsp_neon_la_SOURCES += enc_neon.c
+libwebpdsp_neon_la_SOURCES += lossless_enc_neon.c
+libwebpdsp_neon_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdsp_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_FLAGS)
+libwebpdsp_neon_la_LIBADD = libwebpdspdecode_neon.la
+
 libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES)

 noinst_HEADERS =
 noinst_HEADERS += ../dec/decode_vp8.h
 noinst_HEADERS += ../webp/decode.h

-libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
+libwebpdsp_la_CPPFLAGS =
+libwebpdsp_la_CPPFLAGS += $(AM_CPPFLAGS)
+libwebpdsp_la_CPPFLAGS += $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
 libwebpdsp_la_LDFLAGS = -lm
-libwebpdsp_la_LIBADD = libwebpdsp_avx2.la libwebpdsp_sse2.la
+libwebpdsp_la_LIBADD =
+libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la
+libwebpdsp_la_LIBADD += libwebpdsp_sse41.la
+libwebpdsp_la_LIBADD += libwebpdsp_neon.la
+libwebpdsp_la_LIBADD += libwebpdspdecode_msa.la

 if BUILD_LIBWEBPDECODER
  libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)

  libwebpdspdecode_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
  libwebpdspdecode_la_LDFLAGS = $(libwebpdsp_la_LDFLAGS)
-  libwebpdspdecode_la_LIBADD = libwebpdspdecode_sse2.la
+  libwebpdspdecode_la_LIBADD =
+  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse2.la
+  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse41.la
+  libwebpdspdecode_la_LIBADD += libwebpdspdecode_neon.la
+  libwebpdspdecode_la_LIBADD += libwebpdspdecode_msa.la
 endif
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@ -134,7 +134,7 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {

 #endif    // USE_TABLES_FOR_ALPHA_MULT

-static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t argb = ptr[x];
@ -154,8 +154,8 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  }
 }

-static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
-                    int width, int inverse) {
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+                  int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t a = alpha[x];
@ -284,6 +284,38 @@ static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
 #endif
 }

+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+                         int width, int height,
+                         uint8_t* dst, int dst_stride) {
+  uint32_t alpha_mask = 0xff;
+  int i, j;
+
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = alpha[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+
+  return (alpha_mask != 0xff);
+}
+
+static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
+                                 int width, int height,
+                                 uint32_t* dst, int dst_stride) {
+  int i, j;
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      dst[i] = alpha[i] << 8;  // leave A/R/B channels zero'd.
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+}
+
 static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
@ -304,23 +336,29 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,

 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
+int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);

 //------------------------------------------------------------------------------
 // Init function

+extern void WebPInitAlphaProcessingMIPSdspR2(void);
 extern void WebPInitAlphaProcessingSSE2(void);
+extern void WebPInitAlphaProcessingSSE41(void);

 static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
    (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;

-void WebPInitAlphaProcessing(void) {
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;

-  WebPMultARGBRow = MultARGBRow;
-  WebPMultRow = MultRow;
+  WebPMultARGBRow = WebPMultARGBRowC;
+  WebPMultRow = WebPMultRowC;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+  WebPDispatchAlpha = DispatchAlpha;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
  WebPExtractAlpha = ExtractAlpha;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
@ -328,6 +366,16 @@ void WebPInitAlphaProcessing(void) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitAlphaProcessingSSE2();
+#if defined(WEBP_USE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        WebPInitAlphaProcessingSSE41();
+      }
+#endif
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      WebPInitAlphaProcessingMIPSdspR2();
    }
 #endif
  }
--- a/src/dsp/alpha_processing_mips_dsp_r2.c
+++ b/src/dsp/alpha_processing_mips_dsp_r2.c
@ -0,0 +1,141 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+//            Djordje Pesut  (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+                         int width, int height,
+                         uint8_t* dst, int dst_stride) {
+  uint32_t alpha_mask = 0xffffffff;
+  int i, j, temp0;
+
+  for (j = 0; j < height; ++j) {
+    uint8_t* pdst = dst;
+    const uint8_t* palpha = alpha;
+    for (i = 0; i < (width >> 2); ++i) {
+      int temp1, temp2, temp3;
+
+      __asm__ volatile (
+        "ulw    %[temp0],      0(%[palpha])                \n\t"
+        "addiu  %[palpha],     %[palpha],     4            \n\t"
+        "addiu  %[pdst],       %[pdst],       16           \n\t"
+        "srl    %[temp1],      %[temp0],      8            \n\t"
+        "srl    %[temp2],      %[temp0],      16           \n\t"
+        "srl    %[temp3],      %[temp0],      24           \n\t"
+        "and    %[alpha_mask], %[alpha_mask], %[temp0]     \n\t"
+        "sb     %[temp0],      -16(%[pdst])                \n\t"
+        "sb     %[temp1],      -12(%[pdst])                \n\t"
+        "sb     %[temp2],      -8(%[pdst])                 \n\t"
+        "sb     %[temp3],      -4(%[pdst])                 \n\t"
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+          [temp3]"=&r"(temp3), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+          [alpha_mask]"+r"(alpha_mask)
+        :
+        : "memory"
+      );
+    }
+
+    for (i = 0; i < (width & 3); ++i) {
+      __asm__ volatile (
+        "lbu    %[temp0],      0(%[palpha])                \n\t"
+        "addiu  %[palpha],     %[palpha],     1            \n\t"
+        "sb     %[temp0],      0(%[pdst])                  \n\t"
+        "and    %[alpha_mask], %[alpha_mask], %[temp0]     \n\t"
+        "addiu  %[pdst],       %[pdst],       4            \n\t"
+        : [temp0]"=&r"(temp0), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+          [alpha_mask]"+r"(alpha_mask)
+        :
+        : "memory"
+      );
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+
+  __asm__ volatile (
+    "ext    %[temp0],      %[alpha_mask], 0, 16            \n\t"
+    "srl    %[alpha_mask], %[alpha_mask], 16               \n\t"
+    "and    %[alpha_mask], %[alpha_mask], %[temp0]         \n\t"
+    "ext    %[temp0],      %[alpha_mask], 0, 8             \n\t"
+    "srl    %[alpha_mask], %[alpha_mask], 8                \n\t"
+    "and    %[alpha_mask], %[alpha_mask], %[temp0]         \n\t"
+    : [temp0]"=&r"(temp0), [alpha_mask]"+r"(alpha_mask)
+    :
+  );
+
+  return (alpha_mask != 0xff);
+}
+
+static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+  int x;
+  const uint32_t c_00ffffff = 0x00ffffffu;
+  const uint32_t c_ff000000 = 0xff000000u;
+  const uint32_t c_8000000  = 0x00800000u;
+  const uint32_t c_8000080  = 0x00800080u;
+  for (x = 0; x < width; ++x) {
+    const uint32_t argb = ptr[x];
+    if (argb < 0xff000000u) {      // alpha < 255
+      if (argb <= 0x00ffffffu) {   // alpha == 0
+        ptr[x] = 0;
+      } else {
+        int temp0, temp1, temp2, temp3, alpha;
+        __asm__ volatile (
+          "srl          %[alpha],   %[argb],       24                \n\t"
+          "replv.qb     %[temp0],   %[alpha]                         \n\t"
+          "and          %[temp0],   %[temp0],      %[c_00ffffff]     \n\t"
+          "beqz         %[inverse], 0f                               \n\t"
+          "divu         $zero,      %[c_ff000000], %[alpha]          \n\t"
+          "mflo         %[temp0]                                     \n\t"
+        "0:                                                          \n\t"
+          "andi         %[temp1],   %[argb],       0xff              \n\t"
+          "ext          %[temp2],   %[argb],       8,             8  \n\t"
+          "ext          %[temp3],   %[argb],       16,            8  \n\t"
+          "mul          %[temp1],   %[temp1],      %[temp0]          \n\t"
+          "mul          %[temp2],   %[temp2],      %[temp0]          \n\t"
+          "mul          %[temp3],   %[temp3],      %[temp0]          \n\t"
+          "precrq.ph.w  %[temp1],   %[temp2],      %[temp1]          \n\t"
+          "addu         %[temp3],   %[temp3],      %[c_8000000]      \n\t"
+          "addu         %[temp1],   %[temp1],      %[c_8000080]      \n\t"
+          "precrq.ph.w  %[temp3],   %[argb],       %[temp3]          \n\t"
+          "precrq.qb.ph %[temp1],   %[temp3],      %[temp1]          \n\t"
+          : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+            [temp3]"=&r"(temp3), [alpha]"=&r"(alpha)
+          : [inverse]"r"(inverse), [c_00ffffff]"r"(c_00ffffff),
+            [c_8000000]"r"(c_8000000), [c_8000080]"r"(c_8000080),
+            [c_ff000000]"r"(c_ff000000), [argb]"r"(argb)
+          : "memory", "hi", "lo"
+        );
+        ptr[x] = temp1;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitAlphaProcessingMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
+  WebPDispatchAlpha = DispatchAlpha;
+  WebPMultARGBRow = MultARGBRow;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@ -18,6 +18,86 @@

 //------------------------------------------------------------------------------

+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+                         int width, int height,
+                         uint8_t* dst, int dst_stride) {
+  // alpha_and stores an 'and' operation of all the alpha[] values. The final
+  // value is not 0xff if any of the alpha[] is not equal to 0xff.
+  uint32_t alpha_and = 0xff;
+  int i, j;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+  __m128i all_alphas = all_0xff;
+
+  // We must be able to access 3 extra bytes after the last written byte
+  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
+  // last byte of the quadruplet.
+  const int limit = (width - 1) & ~7;
+
+  for (j = 0; j < height; ++j) {
+    __m128i* out = (__m128i*)dst;
+    for (i = 0; i < limit; i += 8) {
+      // load 8 alpha bytes
+      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
+      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
+      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
+      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
+      // load 8 dst pixels (32 bytes)
+      const __m128i b0_lo = _mm_loadu_si128(out + 0);
+      const __m128i b0_hi = _mm_loadu_si128(out + 1);
+      // mask dst alpha values
+      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
+      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
+      // combine
+      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
+      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
+      // store
+      _mm_storeu_si128(out + 0, b2_lo);
+      _mm_storeu_si128(out + 1, b2_hi);
+      // accumulate eight alpha 'and' in parallel
+      all_alphas = _mm_and_si128(all_alphas, a0);
+      out += 2;
+    }
+    for (; i < width; ++i) {
+      const uint32_t alpha_value = alpha[i];
+      dst[4 * i] = alpha_value;
+      alpha_and &= alpha_value;
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+  // Combine the eight alpha 'and' into an 8-bit mask.
+  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+  return (alpha_and != 0xff);
+}
+
+static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
+                                 int width, int height,
+                                 uint32_t* dst, int dst_stride) {
+  int i, j;
+  const __m128i zero = _mm_setzero_si128();
+  const int limit = width & ~15;
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < limit; i += 16) {   // process 16 alpha bytes
+      const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
+      const __m128i a1 = _mm_unpacklo_epi8(zero, a0);  // note the 'zero' first!
+      const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
+      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
+      const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
+      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
+      const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
+      _mm_storeu_si128((__m128i*)&dst[i +  0], a2_lo);
+      _mm_storeu_si128((__m128i*)&dst[i +  4], a2_hi);
+      _mm_storeu_si128((__m128i*)&dst[i +  8], b2_lo);
+      _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
+    }
+    for (; i < width; ++i) dst[i] = alpha[i] << 8;
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+}
+
 static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
@ -63,15 +143,156 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
  return (alpha_and == 0xff);
 }

-#endif   // WEBP_USE_SSE2
+//------------------------------------------------------------------------------
+// Non-dither premultiplied modes
+
+#define MULTIPLIER(a)   ((a) * 0x8081)
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
+
+// We can't use a 'const int' for the SHUFFLE value: it has to be an immediate
+// in the _mm_shufflexx_epi16() instruction. We really need a macro here.
+#define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do {             \
+  const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX));     \
+  const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);         \
+  const __m128i alpha0 = _mm_and_si128(argb1, MASK);            \
+  const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE);  \
+  const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE);  \
+  /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */                       \
+  const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT);         \
+  const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT);         \
+  const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);         \
+  const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);         \
+  const __m128i argb4 = _mm_adds_epu16(argb2, argb3);           \
+  const __m128i argb5 = _mm_srli_epi16(argb4, 7);               \
+  const __m128i argb6 = _mm_or_si128(argb5, alpha0);            \
+  const __m128i argb7 = _mm_packus_epi16(argb6, zero);          \
+  _mm_storel_epi64((__m128i*)&(RGBX), argb7);                   \
+} while (0)
+
+static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
+                               int w, int h, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const int kSpan = 2;
+  const int w2 = w & ~(kSpan - 1);
+  while (h-- > 0) {
+    uint32_t* const rgbx = (uint32_t*)rgba;
+    int i;
+    if (!alpha_first) {
+      const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
+      const __m128i kMult =
+          _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);
+      for (i = 0; i < w2; i += kSpan) {
+        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
+      }
+    } else {
+      const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
+      const __m128i kMult =
+          _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);
+      for (i = 0; i < w2; i += kSpan) {
+        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
+      }
+    }
+    // Finish with left-overs.
+    for (; i < w; ++i) {
+      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+      const uint32_t a = alpha[4 * i];
+      if (a != 0xff) {
+        const uint32_t mult = MULTIPLIER(a);
+        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+      }
+    }
+    rgba += stride;
+  }
+}
+#undef MULTIPLIER
+#undef PREMULTIPLY
+
+// -----------------------------------------------------------------------------
+// Apply alpha value to rows
+
+// We use: kINV255 = (1 << 24) / 255 = 0x010101
+// So: a * kINV255 = (a << 16) | [(a << 8) | a]
+// -> _mm_mulhi_epu16() takes care of the (a<<16) part,
+// and _mm_mullo_epi16(a * 0x0101,...) takes care of the "(a << 8) | a" one.
+
+static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+  int x = 0;
+  if (!inverse) {
+    const int kSpan = 2;
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i kRound =
+        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
+    const __m128i kMult =
+        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
+    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
+    const int w2 = width & ~(kSpan - 1);
+    for (x = 0; x < w2; x += kSpan) {
+      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
+      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
+      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
+      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
+      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
+      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
+      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
+      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
+      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
+      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
+      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
+      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
+      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
+      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
+    }
+  }
+  width -= x;
+  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
+}
+
+static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
+                    int width, int inverse) {
+  int x = 0;
+  if (!inverse) {
+    const int kSpan = 8;
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i kRound = _mm_set1_epi16(1 << 7);
+    const int w2 = width & ~(kSpan - 1);
+    for (x = 0; x < w2; x += kSpan) {
+      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
+      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
+      const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
+      const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
+      const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
+      const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
+      const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
+      const __m128i v4 = _mm_adds_epu16(v2, v3);
+      const __m128i v5 = _mm_adds_epu16(v4, kRound);
+      const __m128i v6 = _mm_srli_epi16(v5, 8);
+      const __m128i v7 = _mm_packus_epi16(v6, zero);
+      _mm_storel_epi64((__m128i*)&ptr[x], v7);
+    }
+  }
+  width -= x;
+  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
+}

 //------------------------------------------------------------------------------
-// Init function
+// Entry point

 extern void WebPInitAlphaProcessingSSE2(void);

-void WebPInitAlphaProcessingSSE2(void) {
-#if defined(WEBP_USE_SSE2)
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
+  WebPMultARGBRow = MultARGBRow;
+  WebPMultRow = MultRow;
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+  WebPDispatchAlpha = DispatchAlpha;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
  WebPExtractAlpha = ExtractAlpha;
-#endif
 }
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/alpha_processing_sse41.c
+++ b/src/dsp/alpha_processing_sse41.c
@ -0,0 +1,92 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel, SSE4.1 variant.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include <smmintrin.h>
+
+//------------------------------------------------------------------------------
+
+// Copies the first byte of every 4-byte pixel of the 'width' x 'height' argb[]
+// plane into alpha[]. Consecutive rows are 'argb_stride' and 'alpha_stride'
+// bytes apart, respectively. Returns 1 if every copied value was 0xff, and 0
+// otherwise.
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  // Each shuffle mask picks bytes 0, 4, 8 and 12 of one 16-byte load and drops
+  // them into a different 4-byte slot of the result (-1 zeroes the byte), so
+  // the four shuffled registers can simply be OR-ed together.
+  const __m128i kSlot0 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1,
+                                       -1, -1, -1, -1, -1, -1, -1, -1);
+  const __m128i kSlot1 = _mm_setr_epi8(-1, -1, -1, -1, 0, 4, 8, 12,
+                                       -1, -1, -1, -1, -1, -1, -1, -1);
+  const __m128i kSlot2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+                                       0, 4, 8, 12, -1, -1, -1, -1);
+  const __m128i kSlot3 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+                                       -1, -1, -1, -1, 0, 4, 8, 12);
+  const __m128i ones = _mm_set1_epi32(~0u);
+  __m128i simd_and = ones;   // running 'and' of the vectorized alpha values
+  uint32_t tail_and = 0xff;  // running 'and' of the scalar leftovers
+  // At least the last pixel of each row is left to the scalar loop: only the
+  // byte 'argb[4 * width - 4]' of that pixel is known to be readable, because
+  // alpha may be either the first or the last byte of the quadruplet, so the
+  // 16-byte loads must stop before it.
+  const int simd_width = (width - 1) & ~15;
+  int row;
+  for (row = 0; row < height; ++row) {
+    int i = 0;
+    while (i < simd_width) {
+      // 16 pixels (64 bytes) per pass.
+      const __m128i* const px = (const __m128i*)(argb + 4 * i);
+      const __m128i q0 = _mm_shuffle_epi8(_mm_loadu_si128(px + 0), kSlot0);
+      const __m128i q1 = _mm_shuffle_epi8(_mm_loadu_si128(px + 1), kSlot1);
+      const __m128i q2 = _mm_shuffle_epi8(_mm_loadu_si128(px + 2), kSlot2);
+      const __m128i q3 = _mm_shuffle_epi8(_mm_loadu_si128(px + 3), kSlot3);
+      const __m128i packed =
+          _mm_or_si128(_mm_or_si128(q0, q1), _mm_or_si128(q2, q3));
+      _mm_storeu_si128((__m128i*)(alpha + i), packed);
+      simd_and = _mm_and_si128(simd_and, packed);
+      i += 16;
+    }
+    while (i < width) {
+      const uint8_t value = argb[4 * i];
+      alpha[i] = value;
+      tail_and &= value;
+      ++i;
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  {
+    // One bit per vector lane, set when that lane's running 'and' is still
+    // 0xff. The scalar result only occupies bits [0..7], so bits [8..15] are
+    // forced on before both results are merged into a 16-bit mask.
+    const uint32_t lanes_ok =
+        (uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi8(simd_and, ones));
+    return (((tail_and | 0xff00u) & lanes_ok) == 0xffffu);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitAlphaProcessingSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
+  WebPExtractAlpha = ExtractAlpha;  // use this file's SSE4.1 ExtractAlpha()
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE41)
+
+#endif  // WEBP_USE_SSE41
--- a/src/dsp/argb.c
+++ b/src/dsp/argb.c
@ -0,0 +1,68 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions.
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+// Assembles one 32-bit word laid out as (a << 24) | (r << 16) | (g << 8) | b.
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  const uint32_t top = (uint32_t)a << 24;  // shifted as unsigned
+  return (top | (r << 16) | (g << 8) | b);
+}
+
+// Plain-C packer: for each of the 'len' pixels, reads one byte from each of
+// a[], r[], g[] and b[] (samples are 4 bytes apart in every plane) and stores
+// the combined 32-bit word in out[].
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  int n;
+  for (n = 0; n < len; ++n) {
+    const int pos = 4 * n;  // byte position of sample 'n' in each plane
+    out[n] = MakeARGB32(a[pos], r[pos], g[pos], b[pos]);
+  }
+}
+
+// Plain-C packer for opaque pixels: for each of the 'len' pixels, reads one
+// byte from each of r[], g[] and b[] (samples are 'step' bytes apart) and
+// stores it in out[] with the top byte forced to 0xff.
+static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out) {
+  int n;
+  for (n = 0; n < len; ++n) {
+    const int pos = n * step;  // byte position of sample 'n' in each plane
+    out[n] = MakeARGB32(0xff, r[pos], g[pos], b[pos]);
+  }
+}
+
+void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+                    const uint8_t*, int, uint32_t*);  // set by VP8EncDspARGBInit
+void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+                   int, int, uint32_t*);  // set by VP8EncDspARGBInit
+
+extern void VP8EncDspARGBInitMIPSdspR2(void);  // see argb_mips_dsp_r2.c
+extern void VP8EncDspARGBInitSSE2(void);  // see argb_sse2.c
+
+static volatile VP8CPUInfo argb_last_cpuinfo_used =
+    (VP8CPUInfo)&argb_last_cpuinfo_used;  // own address as "never run" marker
+
+// Installs the plain-C VP8PackARGB / VP8PackRGB implementations, then lets each
+// compiled-in CPU-specific initializer replace them when VP8GetCPUInfo reports
+// the matching feature. The VP8GetCPUInfo value used is remembered, so calling
+// again with the same value does nothing.
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
+  if (argb_last_cpuinfo_used != VP8GetCPUInfo) {
+    // Portable defaults first.
+    VP8PackRGB = PackRGB;
+    VP8PackARGB = PackARGB;
+
+    // Faster versions, when a CPU-info callback is available.
+    if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+      if (VP8GetCPUInfo(kSSE2)) VP8EncDspARGBInitSSE2();
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+      if (VP8GetCPUInfo(kMIPSdspR2)) VP8EncDspARGBInitMIPSdspR2();
+#endif
+    }
+    argb_last_cpuinfo_used = VP8GetCPUInfo;
+  }
+}
--- a/src/dsp/argb_mips_dsp_r2.c
+++ b/src/dsp/argb_mips_dsp_r2.c
@ -0,0 +1,110 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions (mips version).
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  int temp0, temp1, temp2, temp3, offset;  // asm scratch values + byte offset
+  const int rest = len & 1;  // 1 when len is odd
+  const uint32_t* const loop_end = out + len - rest;  // end of the main loop
+  const int step = 4;  // bytes between successive samples of a plane
+  __asm__ volatile (  // per pixel: *out++ = a<<24 | r<<16 | g<<8 | b
+    "xor          %[offset],   %[offset], %[offset]    \n\t"  // offset = 0
+    "beq          %[loop_end], %[out],    0f           \n\t"  // skip empty loop
+  "2:                                                  \n\t"  // main loop
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"  // a[offset]
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"  // r[offset]
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"  // g[offset]
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"  // b[offset]
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"  // (a << 16) | r
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"  // (g << 16) | b
+    "addiu        %[out],      %[out],    4            \n\t"  // next 32b slot
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"  // bytes a,r,g,b
+    "sw           %[temp0],    -4(%[out])              \n\t"  // slot just left
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"
+    "beq          %[rest],     $zero,     1f           \n\t"  // len even: done
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"  // last pixel
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"  // the asm stores through 'out'
+  );
+}
+
+static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out) {
+  int temp0, temp1, temp2, offset;  // asm scratch values + byte offset
+  const int rest = len & 1;  // 1 when len is odd
+  const int a = 0xff;  // constant top byte of every output word
+  const uint32_t* const loop_end = out + len - rest;  // end of the main loop
+  __asm__ volatile (  // per pixel: *out++ = 0xff<<24 | r<<16 | g<<8 | b
+    "xor          %[offset],   %[offset], %[offset]    \n\t"  // offset = 0
+    "beq          %[loop_end], %[out],    0f           \n\t"  // skip empty loop
+  "2:                                                  \n\t"  // main loop
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"  // r[offset]
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"  // g[offset]
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"  // b[offset]
+    "ins          %[temp0],    %[a],      16,     16   \n\t"  // (0xff << 16) | r
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"  // (g << 16) | b
+    "addiu        %[out],      %[out],    4            \n\t"  // next 32b slot
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"  // bytes ff,r,g,b
+    "sw           %[temp0],    -4(%[out])              \n\t"  // slot just left
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"
+    "beq          %[rest],     $zero,     1f           \n\t"  // len even: done
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"  // last pixel
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"  // the asm stores through 'out'
+  );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspARGBInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
+  VP8PackARGB = PackARGB;  // switch both packers to the
+  VP8PackRGB = PackRGB;    // inline-asm versions of this file
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/argb_sse2.c
+++ b/src/dsp/argb_sse2.c
@ -0,0 +1,67 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions (SSE2 version).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <string.h>
+
+// Builds the 32-bit word (a << 24) | (r << 16) | (g << 8) | b from its four
+// components.
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  const uint32_t alpha_bits = (uint32_t)a << 24;  // shifted as unsigned
+  return (alpha_bits | (r << 16) | (g << 8) | b);
+}
+
+// SSE2 flavour of the ARGB packer: fills out[] with 'len' 32-bit words taken
+// from four byte planes. The planes must be the adjacent bytes of the same
+// 4-byte pixels, in either r,g,b,a or b,g,r,a memory order (checked with
+// assert()).
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  if (g != r + 1) {
+    // b,g,r,a memory order: the bytes are copied to 'out' unchanged.
+    assert(g == b + 1);
+    assert(r == b + 2);
+    assert(a == b + 3);
+    memcpy(out, b, len * 4);
+  } else {
+    // r,g,b,a memory order: the bytes at offsets 0 and 2 of each pixel (R and
+    // B) have to trade places.
+    const __m128i kRedBlue = _mm_set1_epi32(0x00ff00ffu);
+    const int simd_len = len & ~3;  // pixels handled four at a time
+    int n = 0;
+    assert(b == r + 2);
+    assert(a == r + 3);
+    while (n < simd_len) {
+      const __m128i px = _mm_loadu_si128((const __m128i*)(r + 4 * n));
+      const __m128i rb = _mm_and_si128(px, kRedBlue);     // R 0 B 0
+      const __m128i ga = _mm_andnot_si128(kRedBlue, px);  // 0 G 0 A
+      // Swap the two 16-bit halves of every pixel: R 0 B 0 -> B 0 R 0.
+      const __m128i br_lo = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
+      const __m128i br = _mm_shufflehi_epi16(br_lo, _MM_SHUFFLE(2, 3, 0, 1));
+      _mm_storeu_si128((__m128i*)(out + n), _mm_or_si128(br, ga));
+      n += 4;
+    }
+    // Leftover pixels (at most three) are packed one by one.
+    for (; n < len; ++n) {
+      const int pos = 4 * n;
+      out[n] = MakeARGB32(a[pos], r[pos], g[pos], b[pos]);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspARGBInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
+  VP8PackARGB = PackARGB;  // only PackARGB has an SSE2 version in this file
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/common_sse2.h
+++ b/src/dsp/common_sse2.h
@ -0,0 +1,109 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 code common to several files.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#ifndef WEBP_DSP_COMMON_SSE2_H_
+#define WEBP_DSP_COMMON_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_SSE2)
+
+#include <emmintrin.h>
+
+//------------------------------------------------------------------------------
+// Quite useful function for debugging. Left here for convenience.
+
+#if 0
+#include <stdio.h>
+// Debug helper: prints the content of register 'r' to stderr in hexadecimal,
+// prefixed by 'name'. 'size' is the lane width in bits (8, 16 or 32); any other
+// value prints the register as two 64-bit lanes.
+static WEBP_INLINE void PrintReg(const __m128i r, const char* const name,
+                                 int size) {
+  int n;
+  // Lets the 128-bit register be read back lane by lane.
+  union {
+    __m128i r;
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i32[4];
+    uint64_t i64[2];
+  } tmp;
+  tmp.r = r;
+  fprintf(stderr, "%s\t: ", name);
+  if (size == 8) {
+    for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+  } else if (size == 16) {
+    for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+  } else if (size == 32) {
+    for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+  } else {
+    // uint64_t is not 'unsigned long' on every target (e.g. 64-bit Windows or
+    // 32-bit builds), so convert explicitly to the type the conversion
+    // specifier expects.
+    for (n = 0; n < 2; ++n) {
+      fprintf(stderr, "%.16llx ", (unsigned long long)tmp.i64[n]);
+    }
+  }
+  fprintf(stderr, "\n");
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Math functions.
+
+// Return the sum of all the 8b in the register.
+static WEBP_INLINE int VP8HorizontalAdd8b(const __m128i* const a) {
+  // A sum of absolute differences against zero leaves, in each 64-bit half of
+  // the register, the sum of that half's eight bytes.
+  const __m128i half_sums = _mm_sad_epu8(*a, _mm_setzero_si128());
+  // Move the upper half's sum (32-bit lane 2) down to lane 0 ...
+  const __m128i upper = _mm_shuffle_epi32(half_sums, _MM_SHUFFLE(0, 0, 0, 2));
+  // ... and add it to the lower half's sum; lane 0 holds the total.
+  return _mm_cvtsi128_si32(_mm_add_epi32(half_sums, upper));
+}
+
+// Transpose two 4x4 16b matrices horizontally stored in registers.
+static WEBP_INLINE void VP8Transpose_2_4x4_16b(
+    const __m128i* const in0, const __m128i* const in1,
+    const __m128i* const in2, const __m128i* const in3, __m128i* const out0,
+    __m128i* const out1, __m128i* const out2, __m128i* const out3) {
+  // Input: row 'i' of matrix 'a' sits in the low half of in<i>, row 'i' of
+  // matrix 'b' in the high half:
+  //   in0 = a00 a01 a02 a03   b00 b01 b02 b03
+  //   in1 = a10 a11 a12 a13   b10 b11 b12 b13
+  //   in2 = a20 a21 a22 a23   b20 b21 b22 b23
+  //   in3 = a30 a31 a32 a33   b30 b31 b32 b33
+  // All four inputs are read before any output is written.
+
+  // Step 1: interleave 16-bit values of row pairs.
+  const __m128i a_rows01 = _mm_unpacklo_epi16(*in0, *in1);
+  const __m128i a_rows23 = _mm_unpacklo_epi16(*in2, *in3);
+  const __m128i b_rows01 = _mm_unpackhi_epi16(*in0, *in1);
+  const __m128i b_rows23 = _mm_unpackhi_epi16(*in2, *in3);
+  //   a_rows01 = a00 a10 a01 a11   a02 a12 a03 a13
+  //   a_rows23 = a20 a30 a21 a31   a22 a32 a23 a33
+  //   b_rows01 = b00 b10 b01 b11   b02 b12 b03 b13
+  //   b_rows23 = b20 b30 b21 b31   b22 b32 b23 b33
+
+  // Step 2: interleave 32-bit pairs, which completes each column.
+  const __m128i a_cols01 = _mm_unpacklo_epi32(a_rows01, a_rows23);
+  const __m128i b_cols01 = _mm_unpacklo_epi32(b_rows01, b_rows23);
+  const __m128i a_cols23 = _mm_unpackhi_epi32(a_rows01, a_rows23);
+  const __m128i b_cols23 = _mm_unpackhi_epi32(b_rows01, b_rows23);
+  //   a_cols01 = a00 a10 a20 a30   a01 a11 a21 a31
+  //   b_cols01 = b00 b10 b20 b30   b01 b11 b21 b31
+  //   a_cols23 = a02 a12 a22 a32   a03 a13 a23 a33
+  //   b_cols23 = b02 b12 b22 b32   b03 b13 b23 b33
+
+  // Step 3: pair up matching columns of 'a' and 'b'.
+  *out0 = _mm_unpacklo_epi64(a_cols01, b_cols01);
+  *out1 = _mm_unpackhi_epi64(a_cols01, b_cols01);
+  *out2 = _mm_unpacklo_epi64(a_cols23, b_cols23);
+  *out3 = _mm_unpackhi_epi64(a_cols23, b_cols23);
+  //   out0 = a00 a10 a20 a30   b00 b10 b20 b30
+  //   out1 = a01 a11 a21 a31   b01 b11 b21 b31
+  //   out2 = a02 a12 a22 a32   b02 b12 b22 b32
+  //   out3 = a03 a13 a23 a33   b03 b13 b23 b33
+}
+
+#endif  // WEBP_USE_SSE2
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_DSP_COMMON_SSE2_H_
--- a/src/dsp/cost.c
+++ b/src/dsp/cost.c
@ -0,0 +1,412 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+#include "../enc/cost.h"
+
+//------------------------------------------------------------------------------
+// Boolean-cost cost table
+
+const uint16_t VP8EntropyCost[256] = {
+  1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
+  1178, 1152, 1110, 1076, 1061, 1024, 1024,  992,  968,  951,
+   939,  911,  896,  878,  871,  854,  838,  820,  811,  794,
+   786,  768,  768,  752,  740,  732,  720,  709,  704,  690,
+   683,  672,  666,  655,  647,  640,  631,  622,  615,  607,
+   598,  592,  586,  576,  572,  564,  559,  555,  547,  541,
+   534,  528,  522,  512,  512,  504,  500,  494,  488,  483,
+   477,  473,  467,  461,  458,  452,  448,  443,  438,  434,
+   427,  424,  419,  415,  410,  406,  403,  399,  394,  390,
+   384,  384,  377,  374,  370,  366,  362,  359,  355,  351,
+   347,  342,  342,  336,  333,  330,  326,  323,  320,  316,
+   312,  308,  305,  302,  299,  296,  293,  288,  287,  283,
+   280,  277,  274,  272,  268,  266,  262,  256,  256,  256,
+   251,  248,  245,  242,  240,  237,  234,  232,  228,  226,
+   223,  221,  218,  216,  214,  211,  208,  205,  203,  201,
+   198,  196,  192,  191,  188,  187,  183,  181,  179,  176,
+   175,  171,  171,  168,  165,  163,  160,  159,  156,  154,
+   152,  150,  148,  146,  144,  142,  139,  138,  135,  133,
+   131,  128,  128,  125,  123,  121,  119,  117,  115,  113,
+   111,  110,  107,  105,  103,  102,  100,   98,   96,   94,
+    92,   91,   89,   86,   86,   83,   82,   80,   77,   76,
+    74,   73,   71,   69,   67,   66,   64,   63,   61,   59,
+    57,   55,   54,   52,   51,   49,   47,   46,   44,   43,
+    41,   40,   38,   36,   35,   33,   32,   30,   29,   27,
+    25,   24,   22,   21,   19,   18,   16,   15,   13,   12,
+    10,    9,    7,    6,    4,    3
+};
+
+//------------------------------------------------------------------------------
+// Level cost tables
+
+// Fixed costs for coding levels, deduced from the coding tree.
+// This is only the part that doesn't depend on the probability state.
+const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
+     0,  256,  256,  256,  256,  432,  618,  630,
+   731,  640,  640,  828,  901,  948, 1021, 1101,
+  1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
+  1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
+  1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
+  1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
+  1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
+  1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
+  1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
+  1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
+  1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
+  1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
+  1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
+  2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
+  2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
+  2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
+  2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
+  2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
+  2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
+  2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
+  2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
+  2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
+  2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
+  2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
+  2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
+  2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
+  2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
+  2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
+  2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
+  2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
+  2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
+  3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
+  3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
+  3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
+  3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
+  3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
+  3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
+  3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
+  3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
+  3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
+  3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
+  2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
+  2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
+  3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
+  3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
+  3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
+  3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
+  3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
+  3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
+  3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
+  3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
+  3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
+  3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
+  3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
+  3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
+  3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
+  3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
+  3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
+  3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
+  3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
+  3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
+  3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
+  4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
+  4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
+  4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
+  4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
+  4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
+  4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
+  4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
+  4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
+  4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
+  4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
+  4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
+  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+  6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
+  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+  6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
+  5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
+  5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
+  5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
+  5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
+  5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
+  5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
+  5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
+  5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
+  5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
+  5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
+  5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
+  6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
+  6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
+  6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
+  6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
+  6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
+  6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
+  6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
+  6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
+  6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
+  6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
+  6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
+  6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
+  6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
+  6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
+  6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
+  6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
+  6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
+  6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
+  6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
+  7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
+  7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
+  6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
+  6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
+  6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
+  6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
+  6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
+  6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
+  6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
+  6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
+  6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
+  6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
+  7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
+  7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
+  7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
+  7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
+  7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
+  7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
+  7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
+  7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
+  7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
+  7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
+  7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
+  7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
+  7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
+};
+
+//------------------------------------------------------------------------------
+// Tables for level coding
+
+const uint8_t VP8EncBands[16 + 1] = {  // coeff position -> prob[] band index
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  // sentinel
+};
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+// Plain-C reference: returns the accumulated cost of the coefficients
+// res->coeffs[res->first .. res->last], starting in context 'ctx0'.
+// When res->last is negative (no non-zero coefficient was recorded), only the
+// cost of a 0 bit under the initial probability is returned.
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+  const int first = res->first;
+  const int last = res->last;
+  // Strictly this should be prob[VP8EncBands[first]], but the table maps
+  // positions 0 and 1 to themselves, so indexing by 'first' is equivalent.
+  const int p0 = res->prob[first][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* table;
+  int total, pos, level;
+
+  if (last < 0) return VP8BitCost(0, p0);
+
+  // The t[] tables already include bit_cost(1, p0), but only for ctx != 0 (as
+  // the syntax requires). For ctx0 == 0 it has to be accounted for here.
+  total = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+  table = costs[first][ctx0];
+
+  // All coefficients before the last one: add the level cost, then switch to
+  // the table of the next position, selected by the context min(|coeff|, 2).
+  for (pos = first; pos < last; ++pos) {
+    level = abs(res->coeffs[pos]);
+    total += VP8LevelCost(table, level);
+    table = costs[pos + 1][(level < 2) ? level : 2];
+  }
+
+  // The coefficient at 'last' is non-zero by construction.
+  level = abs(res->coeffs[pos]);
+  assert(level != 0);
+  total += VP8LevelCost(table, level);
+  if (pos < 15) {
+    // Not at the final position: add the cost of a 0 bit taken from the
+    // probabilities of the next position's band.
+    const int band = VP8EncBands[pos + 1];
+    const int ctx = (level == 1) ? 1 : 2;
+    total += VP8BitCost(0, res->prob[band][ctx][0]);
+  }
+  return total;
+}
+
+// Attaches 'coeffs' (16 values) to 'res' and records in res->last the index of
+// the last non-zero coefficient, or -1 if all of them are zero.
+static void SetResidualCoeffs(const int16_t* const coeffs,
+                              VP8Residual* const res) {
+  int pos = 15;
+  assert(res->first == 0 || coeffs[0] == 0);
+  // Scan backwards; 'pos' ends at -1 when no non-zero value is found.
+  while (pos >= 0 && coeffs[pos] == 0) --pos;
+  res->last = pos;
+  res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+VP8GetResidualCostFunc VP8GetResidualCost;
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+extern void VP8EncDspCostInitMIPS32(void);
+extern void VP8EncDspCostInitMIPSdspR2(void);
+extern void VP8EncDspCostInitSSE2(void);
+
+static volatile VP8CPUInfo cost_last_cpuinfo_used =
+    (VP8CPUInfo)&cost_last_cpuinfo_used;
+
+// Installs the residual-cost function pointers. The plain-C implementations
+// are set first; each compiled-in platform initializer whose CPU feature is
+// reported by VP8GetCPUInfo may then override them. The work is skipped when
+// VP8GetCPUInfo is the same callback that was seen by the previous call.
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
+  if (cost_last_cpuinfo_used != VP8GetCPUInfo) {
+    // Portable defaults.
+    VP8GetResidualCost = GetResidualCost;
+    VP8SetResidualCoeffs = SetResidualCoeffs;
+
+    // If a CPU-info callback is defined, let the faster versions overwrite
+    // some of the pointers. Later initializers win over earlier ones.
+    if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+      if (VP8GetCPUInfo(kMIPS32)) VP8EncDspCostInitMIPS32();
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+      if (VP8GetCPUInfo(kMIPSdspR2)) VP8EncDspCostInitMIPSdspR2();
+#endif
+#if defined(WEBP_USE_SSE2)
+      if (VP8GetCPUInfo(kSSE2)) VP8EncDspCostInitSSE2();
+#endif
+    }
+
+    // Remember which callback this initialization was done for.
+    cost_last_cpuinfo_used = VP8GetCPUInfo;
+  }
+}
+
+//------------------------------------------------------------------------------
--- a/src/dsp/cost_mips32.c
+++ b/src/dsp/cost_mips32.c
@ -0,0 +1,154 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../enc/cost.h"
+// MIPS32 asm variant: sums level/bit costs over res->coeffs[first..last].
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+  int temp0, temp1;     // asm scratch registers
+  int v_reg, ctx_reg;   // asm scratch: |coeff| and its context (capped at 2)
+  int n = res->first;   // current coefficient index, advanced by the asm loop
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];   // cost table for the current position
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+  const int16_t* res_coeffs = res->coeffs;   // read cursor, moved by the asm
+  const int res_last = res->last;
+  const int const_max_level = MAX_VARIABLE_LEVEL;   // clamp for t[] lookups
+  const int const_2 = 2;                            // largest context value
+  const uint16_t** p_costs = &costs[n][0];   // row costs[n], stepped per coeff
+  const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);   // bytes per row
+  // No non-zero coefficient: the cost is that of a single 0 bit under p0.
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+  // Per n in [first, last): cost += VP8LevelFixedCosts[v] + t[min(v, max)].
+  __asm__ volatile (
+    ".set      push                                                        \n\t"
+    ".set      noreorder                                                   \n\t"
+    "subu      %[temp1],        %[res_last],        %[n]                   \n\t"
+    "sll       %[temp0],        %[n],               1                      \n\t"
+    "blez      %[temp1],        2f                                         \n\t"
+    " addu     %[res_coeffs],   %[res_coeffs],      %[temp0]               \n\t"
+  "1:                                                                      \n\t"
+    "lh        %[v_reg],        0(%[res_coeffs])                           \n\t"
+    "addiu     %[n],            %[n],               1                      \n\t"
+    "negu      %[temp0],        %[v_reg]                                   \n\t"
+    "slti      %[temp1],        %[v_reg],           0                      \n\t"
+    "movn      %[v_reg],        %[temp0],           %[temp1]               \n\t"
+    "sltiu     %[temp0],        %[v_reg],           2                      \n\t"
+    "move      %[ctx_reg],      %[v_reg]                                   \n\t"
+    "movz      %[ctx_reg],      %[const_2],         %[temp0]               \n\t"
+    "sll       %[temp1],        %[v_reg],           1                      \n\t"
+    "addu      %[temp1],        %[temp1],           %[VP8LevelFixedCosts]  \n\t"
+    "lhu       %[temp1],        0(%[temp1])                                \n\t"
+    "slt       %[temp0],        %[v_reg],           %[const_max_level]     \n\t"
+    "movz      %[v_reg],        %[const_max_level], %[temp0]               \n\t"
+    "addu      %[cost],         %[cost],            %[temp1]               \n\t"
+    "sll       %[v_reg],        %[v_reg],           1                      \n\t"
+    "sll       %[ctx_reg],      %[ctx_reg],         2                      \n\t"
+    "addu      %[v_reg],        %[v_reg],           %[t]                   \n\t"
+    "lhu       %[temp0],        0(%[v_reg])                                \n\t"
+    "addu      %[p_costs],      %[p_costs],         %[inc_p_costs]         \n\t"
+    "addu      %[t],            %[p_costs],         %[ctx_reg]             \n\t"
+    "addu      %[cost],         %[cost],            %[temp0]               \n\t"
+    "addiu     %[res_coeffs],   %[res_coeffs],      2                      \n\t"
+    "bne       %[n],            %[res_last],        1b                     \n\t"
+    " lw       %[t],            0(%[t])                                    \n\t"
+  "2:                                                                      \n\t"
+    ".set      pop                                                         \n\t"
+    : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+      [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
+    : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+      [inc_p_costs]"r"(inc_p_costs)
+    : "memory"
+  );
+  // Here n == res->last and t is the cost table selected for that position.
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {   // not the final position: also cost the following 0 bit
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+// MIPS32 version of SetResidualCoeffs(): attaches 'coeffs' (16 values) to
+// 'res' and stores in res->last the index of the last non-zero coefficient,
+// or -1 when all coefficients are zero.
+//
+// The asm scans the array backwards two coefficients at a time, using an
+// unaligned 32-bit load (ulw) per pair, starting with the pair (14, 15).
+static void SetResidualCoeffs(const int16_t* const coeffs,
+                              VP8Residual* const res) {
+  // Scan cursor. No cast is needed: the pointee stays const-qualified.
+  const int16_t* p_coeffs = coeffs;
+  int temp0, temp1, temp2, n, n1;
+  assert(res->first == 0 || coeffs[0] == 0);
+
+  // Register roles:
+  //   n      index of the upper coefficient of the current pair (15, 13, ...)
+  //   n1     n - 1, the index of the lower coefficient of the pair
+  //   temp0  the loaded pair, then the selected index
+  //   temp1  the half-word holding coeffs[n] (shift direction depends on
+  //          endianness); non-zero iff coeffs[n] != 0
+  //   temp2  result: -1 by default, otherwise n or n - 1
+  // Indented instructions sit in branch delay slots (.set noreorder).
+  __asm__ volatile (
+    ".set     push                                      \n\t"
+    ".set     noreorder                                 \n\t"
+    "addiu    %[p_coeffs],   %[p_coeffs],    28         \n\t"
+    "li       %[n],          15                         \n\t"
+    "li       %[temp2],      -1                         \n\t"
+  "0:                                                   \n\t"
+    "ulw      %[temp0],      0(%[p_coeffs])             \n\t"
+    "beqz     %[temp0],      1f                         \n\t"
+#if defined(WORDS_BIGENDIAN)
+    " sll     %[temp1],      %[temp0],       16         \n\t"
+#else
+    " srl     %[temp1],      %[temp0],       16         \n\t"
+#endif
+    "addiu    %[n1],         %[n],           -1         \n\t"
+    "movz     %[temp0],      %[n1],          %[temp1]   \n\t"
+    "movn     %[temp0],      %[n],           %[temp1]   \n\t"
+    "j        2f                                        \n\t"
+    " addiu   %[temp2],      %[temp0],       0          \n\t"
+  "1:                                                   \n\t"
+    "addiu    %[n],          %[n],           -2         \n\t"
+    "bgtz     %[n],          0b                         \n\t"
+    " addiu   %[p_coeffs],   %[p_coeffs],    -4         \n\t"
+  "2:                                                   \n\t"
+    ".set     pop                                       \n\t"
+    : [p_coeffs]"+&r"(p_coeffs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [n]"=&r"(n), [n1]"=&r"(n1)
+    :
+    : "memory"
+  );
+  res->last = temp2;
+  res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPS32(void);
+
+// Points both residual-cost hooks at the MIPS32 implementations of this file.
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
+  VP8SetResidualCoeffs = SetResidualCoeffs;
+  VP8GetResidualCost = GetResidualCost;
+}
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
--- a/src/dsp/cost_mips_dsp_r2.c
+++ b/src/dsp/cost_mips_dsp_r2.c
@ -0,0 +1,107 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../enc/cost.h"
+// MIPS DSP-R2 asm variant: sums level/bit costs over res->coeffs[first..last].
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+  int temp0, temp1;     // asm scratch registers
+  int v_reg, ctx_reg;   // asm scratch: |coeff| and its context (capped at 2)
+  int n = res->first;   // current coefficient index, advanced by the asm loop
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];   // cost table for the current position
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+  const int16_t* res_coeffs = res->coeffs;   // base pointer, indexed by 2 * n
+  const int res_last = res->last;
+  const int const_max_level = MAX_VARIABLE_LEVEL;   // clamp for t[] lookups
+  const int const_2 = 2;                            // largest context value
+  const uint16_t** p_costs = &costs[n][0];   // row costs[n], stepped per coeff
+  const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);   // bytes per row
+  // No non-zero coefficient: the cost is that of a single 0 bit under p0.
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+  // Per n in [first, last): cost += VP8LevelFixedCosts[v] + t[min(v, max)].
+  __asm__ volatile (
+    ".set      push                                                     \n\t"
+    ".set      noreorder                                                \n\t"
+    "subu      %[temp1],        %[res_last],        %[n]                \n\t"
+    "blez      %[temp1],        2f                                      \n\t"
+    " nop                                                               \n\t"
+  "1:                                                                   \n\t"
+    "sll       %[temp0],        %[n],               1                   \n\t"
+    "lhx       %[v_reg],        %[temp0](%[res_coeffs])                 \n\t"
+    "addiu     %[n],            %[n],               1                   \n\t"
+    "absq_s.w  %[v_reg],        %[v_reg]                                \n\t"
+    "sltiu     %[temp0],        %[v_reg],           2                   \n\t"
+    "move      %[ctx_reg],      %[v_reg]                                \n\t"
+    "movz      %[ctx_reg],      %[const_2],         %[temp0]            \n\t"
+    "sll       %[temp1],        %[v_reg],           1                   \n\t"
+    "lhx       %[temp1],        %[temp1](%[VP8LevelFixedCosts])         \n\t"
+    "slt       %[temp0],        %[v_reg],           %[const_max_level]  \n\t"
+    "movz      %[v_reg],        %[const_max_level], %[temp0]            \n\t"
+    "addu      %[cost],         %[cost],            %[temp1]            \n\t"
+    "sll       %[v_reg],        %[v_reg],           1                   \n\t"
+    "sll       %[ctx_reg],      %[ctx_reg],         2                   \n\t"
+    "lhx       %[temp0],        %[v_reg](%[t])                          \n\t"
+    "addu      %[p_costs],      %[p_costs],         %[inc_p_costs]      \n\t"
+    "addu      %[t],            %[p_costs],         %[ctx_reg]          \n\t"
+    "addu      %[cost],         %[cost],            %[temp0]            \n\t"
+    "bne       %[n],            %[res_last],        1b                  \n\t"
+    " lw       %[t],            0(%[t])                                 \n\t"
+  "2:                                                                   \n\t"
+    ".set      pop                                                      \n\t"
+    : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+      [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1)
+    : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+      [res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
+    : "memory"
+  );
+  // Here n == res->last and t is the cost table selected for that position.
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {   // not the final position: also cost the following 0 bit
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
+  VP8GetResidualCost = GetResidualCost;   // the only hook set by this file
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/cost_sse2.c
+++ b/src/dsp/cost_sse2.c
@ -0,0 +1,119 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of cost functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+#include "../enc/cost.h"
+#include "../enc/vp8enci.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+
+// SSE2 version of SetResidualCoeffs(): attaches 'coeffs' (16 values) to 'res'
+// and stores in res->last the index of the last non-zero coefficient, or -1
+// when every coefficient is zero.
+static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                                  VP8Residual* const res) {
+  const __m128i lo = _mm_loadu_si128((const __m128i*)&coeffs[0]);
+  const __m128i hi = _mm_loadu_si128((const __m128i*)&coeffs[8]);
+  // Pack the 16 values down to bytes so that one compare covers all of them.
+  const __m128i packed = _mm_packs_epi16(lo, hi);
+  const __m128i is_zero = _mm_cmpeq_epi8(packed, _mm_setzero_si128());
+  // One bit per coefficient, set where the coefficient equals zero...
+  const uint32_t zero_bits = (uint32_t)_mm_movemask_epi8(is_zero);
+  // ...flipped within the low 16 bits to mark the non-zero coefficients.
+  // Bits below res->first need no masking: coeffs[0] is 0 if res->first > 0.
+  const uint32_t nonzero_bits = zero_bits ^ 0x0000ffffu;
+
+  assert(res->first == 0 || coeffs[0] == 0);
+  // The highest set bit is the position of the last non-zero value.
+  if (nonzero_bits != 0) {
+    res->last = BitsLog2Floor(nonzero_bits);
+  } else {
+    res->last = -1;
+  }
+  res->coeffs = coeffs;
+}
+
+// SSE2 version of the residual cost. The absolute levels, the clamped levels
+// and the contexts of all 16 positions are computed up front with SIMD; the
+// scalar loop that follows only performs table lookups.
+static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
+  uint8_t ctxs[16], levels[16];
+  uint16_t abs_levels[16];
+  const int first = res->first;
+  const int last = res->last;
+  // Strictly this should be prob[VP8EncBands[first]], but the table maps
+  // positions 0 and 1 to themselves, so indexing by 'first' is equivalent.
+  const int p0 = res->prob[first][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t;
+  int cost, pos;
+
+  if (last < 0) return VP8BitCost(0, p0);
+
+  {   // Precompute per-position values, packed to 8 bits where possible.
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i max_ctx = _mm_set1_epi8(2);
+    const __m128i max_level = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
+    const __m128i in_lo = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
+    const __m128i in_hi = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
+    // abs(v) = max(v, -v), still 16 bits wide.
+    const __m128i abs_lo = _mm_max_epi16(in_lo, _mm_sub_epi16(zero, in_lo));
+    const __m128i abs_hi = _mm_max_epi16(in_hi, _mm_sub_epi16(zero, in_hi));
+    const __m128i abs8 = _mm_packs_epi16(abs_lo, abs_hi);
+
+    // context = min(|v|, 2), level = min(|v|, MAX_VARIABLE_LEVEL).
+    _mm_storeu_si128((__m128i*)&ctxs[0], _mm_min_epu8(abs8, max_ctx));
+    _mm_storeu_si128((__m128i*)&levels[0], _mm_min_epu8(abs8, max_level));
+    // The unclamped 16-bit magnitudes are kept for the fixed-cost lookup.
+    _mm_storeu_si128((__m128i*)&abs_levels[0], abs_lo);
+    _mm_storeu_si128((__m128i*)&abs_levels[8], abs_hi);
+  }
+
+  // The t[] tables already include bit_cost(1, p0), but only for ctx != 0 (as
+  // the syntax requires). For ctx0 == 0 it has to be accounted for here.
+  cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+  t = costs[first][ctx0];
+
+  for (pos = first; pos < last; ++pos) {
+    // simplified VP8LevelCost(): fixed part from the full level, variable
+    // part from the clamped one.
+    cost += VP8LevelFixedCosts[abs_levels[pos]] + t[levels[pos]];
+    t = costs[pos + 1][ctxs[pos]];
+  }
+
+  // The coefficient at 'last' is non-zero by construction.
+  assert(abs_levels[pos] != 0);
+  cost += VP8LevelFixedCosts[abs_levels[pos]] + t[levels[pos]];
+  if (pos < 15) {
+    // Not at the final position: add the cost of a 0 bit taken from the
+    // probabilities of the next position's band.
+    const int band = VP8EncBands[pos + 1];
+    cost += VP8BitCost(0, res->prob[band][ctxs[pos]][0]);
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitSSE2(void);
+
+// Points both residual-cost hooks at the SSE2 implementations of this file.
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
+  VP8GetResidualCost = GetResidualCostSSE2;
+  VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -13,7 +13,12 @@

 #include "./dsp.h"

-#if defined(__ANDROID__)
+#if defined(WEBP_HAVE_NEON_RTCD)
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#if defined(WEBP_ANDROID_NEON)
 #include <cpu-features.h>
 #endif

@ -31,6 +36,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type), "c"(0));
 }
+#elif defined(__x86_64__) && \
+      (defined(__code_model_medium__) || defined(__code_model_large__)) && \
+      defined(__PIC__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "xchg{q}\t{%%rbx}, %q1\n"
+    "cpuid\n"
+    "xchg{q}\t{%%rbx}, %q1\n"
+    : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]),
+      "=d"(cpu_info[3])
+    : "a"(info_type), "c"(0));
+}
 #elif defined(__i386__) || defined(__x86_64__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
@ -79,7 +96,16 @@ static WEBP_INLINE uint64_t xgetbv(void) {

 #if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
 static int x86CPUInfo(CPUFeature feature) {
+  int max_cpuid_value;
  int cpu_info[4];
+
+  // get the highest feature value cpuid supports
+  GetCPUInfo(cpu_info, 0);
+  max_cpuid_value = cpu_info[0];
+  if (max_cpuid_value < 1) {
+    return 0;
+  }
+
  GetCPUInfo(cpu_info, 1);
  if (feature == kSSE2) {
    return 0 != (cpu_info[3] & 0x04000000);
@ -87,6 +113,9 @@ static int x86CPUInfo(CPUFeature feature) {
  if (feature == kSSE3) {
    return 0 != (cpu_info[2] & 0x00000001);
  }
+  if (feature == kSSE4_1) {
+    return 0 != (cpu_info[2] & 0x00080000);
+  }
  if (feature == kAVX) {
    // bits 27 (OSXSAVE) & 28 (256-bit AVX)
    if ((cpu_info[2] & 0x18000000) == 0x18000000) {
@ -95,7 +124,7 @@ static int x86CPUInfo(CPUFeature feature) {
    }
  }
  if (feature == kAVX2) {
-    if (x86CPUInfo(kAVX)) {
+    if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
      GetCPUInfo(cpu_info, 7);
      return ((cpu_info[1] & 0x00000020) == 0x00000020);
    }
@ -118,14 +147,38 @@ VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
 // define a dummy function to enable turning off NEON at runtime by setting
 // VP8DecGetCPUInfo = NULL
 static int armCPUInfo(CPUFeature feature) {
-  (void)feature;
+  if (feature != kNEON) return 0;
+#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
+  {
+    int has_neon = 0;
+    char line[200];
+    FILE* const cpuinfo = fopen("/proc/cpuinfo", "r");
+    if (cpuinfo == NULL) return 0;
+    while (fgets(line, sizeof(line), cpuinfo)) {
+      if (!strncmp(line, "Features", 8)) {
+        if (strstr(line, " neon ") != NULL) {
+          has_neon = 1;
+          break;
+        }
+      }
+    }
+    fclose(cpuinfo);
+    return has_neon;
+  }
+#else
  return 1;
+#endif
 }
 VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
-#elif defined(WEBP_USE_MIPS32)
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \
+      defined(WEBP_USE_MSA)
 static int mipsCPUInfo(CPUFeature feature) {
-  (void)feature;
-  return 1;
+  if ((feature == kMIPS32) || (feature == kMIPSdspR2) || (feature == kMSA)) {
+    return 1;
+  } else {
+    return 0;
+  }
+
 }
 VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
 #else
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -7,12 +7,13 @@
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// Speed-critical decoding functions.
+// Speed-critical decoding functions, default plain-C implementations.
 //
 // Author: Skal (pascal.massimino@gmail.com)

 #include "./dsp.h"
 #include "../dec/vp8i.h"
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------

@ -34,9 +35,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
  STORE(3, y, DC - (d));            \
 } while (0)

-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
-#define MUL(a, b) (((a) * (b)) >> 16)
+#define MUL1(a) ((((a) * 20091) >> 16) + (a))
+#define MUL2(a) (((a) * 35468) >> 16)

 static void TransformOne(const int16_t* in, uint8_t* dst) {
  int C[4 * 4], *tmp;
@ -45,8 +45,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  for (i = 0; i < 4; ++i) {    // vertical pass
    const int a = in[0] + in[8];    // [-4096, 4094]
    const int b = in[0] - in[8];    // [-4095, 4095]
-    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);   // [-3783, 3783]
-    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);   // [-3785, 3781]
+    const int c = MUL2(in[4]) - MUL1(in[12]);   // [-3783, 3783]
+    const int d = MUL1(in[4]) + MUL2(in[12]);   // [-3785, 3781]
    tmp[0] = a + d;   // [-7881, 7875]
    tmp[1] = b + c;   // [-7878, 7878]
    tmp[2] = b - c;   // [-7878, 7878]
@ -55,7 +55,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    in++;
  }
  // Each pass is expanding the dynamic range by ~3.85 (upper bound).
-  // The exact value is (2. + (kC1 + kC2) / 65536).
+  // The exact value is (2. + (20091 + 35468) / 65536).
  // After the second pass, maximum interval is [-3794, 3794], assuming
  // an input in [-2048, 2047] interval. We then need to add a dst value
  // in the [0, 255] range.
@ -66,8 +66,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    const int dc = tmp[0] + 4;
    const int a =  dc +  tmp[8];
    const int b =  dc -  tmp[8];
-    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
-    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+    const int c = MUL2(tmp[4]) - MUL1(tmp[12]);
+    const int d = MUL1(tmp[4]) + MUL2(tmp[12]);
    STORE(0, 0, a + d);
    STORE(1, 0, b + c);
    STORE(2, 0, b - c);
@ -80,16 +80,17 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const int a = in[0] + 4;
-  const int c4 = MUL(in[4], kC2);
-  const int d4 = MUL(in[4], kC1);
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
+  const int c4 = MUL2(in[4]);
+  const int d4 = MUL1(in[4]);
+  const int c1 = MUL2(in[1]);
+  const int d1 = MUL1(in[1]);
  STORE2(0, a + d4, d1, c1);
  STORE2(1, a + c4, d1, c1);
  STORE2(2, a - c4, d1, c1);
  STORE2(3, a - d4, d1, c1);
 }
-#undef MUL
+#undef MUL1
+#undef MUL2
 #undef STORE2

 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
@ -104,7 +105,7 @@ static void TransformUV(const int16_t* in, uint8_t* dst) {
  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }

-static void TransformDC(const int16_t *in, uint8_t* dst) {
+static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int DC = in[0] + 4;
  int i, j;
  for (j = 0; j < 4; ++j) {
@ -160,7 +161,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);

 #define DST(x, y) dst[(x) + (y) * BPS]

-static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8_t* top = dst - BPS;
  const uint8_t* const clip0 = VP8kclip1 - top[-1];
  int y;
@ -173,21 +174,21 @@ static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
    dst += BPS;
  }
 }
-static void TM4(uint8_t *dst)   { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }
+static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }

 //------------------------------------------------------------------------------
 // 16x16

-static void VE16(uint8_t *dst) {     // vertical
+static void VE16(uint8_t* dst) {     // vertical
  int j;
  for (j = 0; j < 16; ++j) {
    memcpy(dst + j * BPS, dst - BPS, 16);
  }
 }

-static void HE16(uint8_t *dst) {     // horizontal
+static void HE16(uint8_t* dst) {     // horizontal
  int j;
  for (j = 16; j > 0; --j) {
    memset(dst, dst[-1], 16);
@ -202,7 +203,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
  }
 }

-static void DC16(uint8_t *dst) {    // DC
+static void DC16(uint8_t* dst) {    // DC
  int DC = 16;
  int j;
  for (j = 0; j < 16; ++j) {
@ -211,7 +212,7 @@ static void DC16(uint8_t *dst) {    // DC
  Put16(DC >> 5, dst);
 }

-static void DC16NoTop(uint8_t *dst) {   // DC with top samples not available
+static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
  int DC = 8;
  int j;
  for (j = 0; j < 16; ++j) {
@ -220,7 +221,7 @@ static void DC16NoTop(uint8_t *dst) {   // DC with top samples not available
  Put16(DC >> 4, dst);
 }

-static void DC16NoLeft(uint8_t *dst) {  // DC with left samples not available
+static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
  int DC = 8;
  int i;
  for (i = 0; i < 16; ++i) {
@ -229,17 +230,19 @@ static void DC16NoLeft(uint8_t *dst) {  // DC with left samples not available
  Put16(DC >> 4, dst);
 }

-static void DC16NoTopLeft(uint8_t *dst) {  // DC with no top and left samples
+static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
  Put16(0x80, dst);
 }

+VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
+
 //------------------------------------------------------------------------------
 // 4x4

 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

-static void VE4(uint8_t *dst) {    // vertical
+static void VE4(uint8_t* dst) {    // vertical
  const uint8_t* top = dst - BPS;
  const uint8_t vals[4] = {
    AVG3(top[-1], top[0], top[1]),
@ -253,19 +256,19 @@ static void VE4(uint8_t *dst) {    // vertical
  }
 }

-static void HE4(uint8_t *dst) {    // horizontal
+static void HE4(uint8_t* dst) {    // horizontal
  const int A = dst[-1 - BPS];
  const int B = dst[-1];
  const int C = dst[-1 + BPS];
  const int D = dst[-1 + 2 * BPS];
  const int E = dst[-1 + 3 * BPS];
-  *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(A, B, C);
-  *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(B, C, D);
-  *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(C, D, E);
-  *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E);
+  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(A, B, C));
+  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(B, C, D));
+  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(C, D, E));
+  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
 }

-static void DC4(uint8_t *dst) {   // DC
+static void DC4(uint8_t* dst) {   // DC
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@ -273,7 +276,7 @@ static void DC4(uint8_t *dst) {   // DC
  for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
 }

-static void RD4(uint8_t *dst) {   // Down-right
+static void RD4(uint8_t* dst) {   // Down-right
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -284,15 +287,15 @@ static void RD4(uint8_t *dst) {   // Down-right
  const int C = dst[2 - BPS];
  const int D = dst[3 - BPS];
  DST(0, 3)                                     = AVG3(J, K, L);
-  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
-  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
-  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
-  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
-  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
-  DST(3, 0)                                     = AVG3(D, C, B);
+  DST(1, 3) = DST(0, 2)                         = AVG3(I, J, K);
+  DST(2, 3) = DST(1, 2) = DST(0, 1)             = AVG3(X, I, J);
+  DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+              DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+                          DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+                                      DST(3, 0) = AVG3(D, C, B);
 }

-static void LD4(uint8_t *dst) {   // Down-Left
+static void LD4(uint8_t* dst) {   // Down-Left
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
@ -305,12 +308,12 @@ static void LD4(uint8_t *dst) {   // Down-Left
  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
-  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
-  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
-  DST(3, 3)                                     = AVG3(G, H, H);
+              DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+                          DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+                                      DST(3, 3) = AVG3(G, H, H);
 }

-static void VR4(uint8_t *dst) {   // Vertical-Right
+static void VR4(uint8_t* dst) {   // Vertical-Right
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -332,7 +335,7 @@ static void VR4(uint8_t *dst) {   // Vertical-Right
  DST(3, 1) =             AVG3(B, C, D);
 }

-static void VL4(uint8_t *dst) {   // Vertical-Left
+static void VL4(uint8_t* dst) {   // Vertical-Left
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
@ -354,7 +357,7 @@ static void VL4(uint8_t *dst) {   // Vertical-Left
              DST(3, 3) = AVG3(F, G, H);
 }

-static void HU4(uint8_t *dst) {   // Horizontal-Up
+static void HU4(uint8_t* dst) {   // Horizontal-Up
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -369,7 +372,7 @@ static void HU4(uint8_t *dst) {   // Horizontal-Up
    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static void HD4(uint8_t *dst) {  // Horizontal-Down
+static void HD4(uint8_t* dst) {  // Horizontal-Down
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -396,17 +399,19 @@ static void HD4(uint8_t *dst) {  // Horizontal-Down
 #undef AVG3
 #undef AVG2

+VP8PredFunc VP8PredLuma4[NUM_BMODES];
+
 //------------------------------------------------------------------------------
 // Chroma

-static void VE8uv(uint8_t *dst) {    // vertical
+static void VE8uv(uint8_t* dst) {    // vertical
  int j;
  for (j = 0; j < 8; ++j) {
    memcpy(dst + j * BPS, dst - BPS, 8);
  }
 }

-static void HE8uv(uint8_t *dst) {    // horizontal
+static void HE8uv(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    memset(dst, dst[-1], 8);
@ -422,7 +427,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  }
 }

-static void DC8uv(uint8_t *dst) {     // DC
+static void DC8uv(uint8_t* dst) {     // DC
  int dc0 = 8;
  int i;
  for (i = 0; i < 8; ++i) {
@ -431,7 +436,7 @@ static void DC8uv(uint8_t *dst) {     // DC
  Put8x8uv(dc0 >> 4, dst);
 }

-static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
@ -440,7 +445,7 @@ static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
  Put8x8uv(dc0 >> 3, dst);
 }

-static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
@ -449,26 +454,11 @@ static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
  Put8x8uv(dc0 >> 3, dst);
 }

-static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
+static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
  Put8x8uv(0x80, dst);
 }

-//------------------------------------------------------------------------------
-// default C implementations
-
-const VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
-  DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
-};
-
-const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
-  DC16, TM16, VE16, HE16,
-  DC16NoTop, DC16NoLeft, DC16NoTopLeft
-};
-
-const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
-  DC8uv, TM8uv, VE8uv, HE8uv,
-  DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
-};
+VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];

 //------------------------------------------------------------------------------
 // Edge filtering functions
@ -665,6 +655,23 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,

 //------------------------------------------------------------------------------

+static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
+                             int dst_stride) {  // adds an 8x8 dither block to 'dst' in place
+  int i, j;
+  for (j = 0; j < 8; ++j) {  // one row of 8 samples per iteration
+    for (i = 0; i < 8; ++i) {
+      const int delta0 = dither[i] - VP8_DITHER_AMP_CENTER;  // signed offset around the center
+      const int delta1 =
+          (delta0 + VP8_DITHER_DESCALE_ROUNDER) >> VP8_DITHER_DESCALE;  // descale with rounding term
+      dst[i] = clip_8b((int)dst[i] + delta1);  // NOTE(review): clip_8b presumably clamps to 0..255
+    }
+    dst += dst_stride;  // next output row
+    dither += 8;  // dither rows are contiguous, 8 values each
+  }
+}
+
+//------------------------------------------------------------------------------
+
 VP8DecIdct2 VP8Transform;
 VP8DecIdct VP8TransformAC3;
 VP8DecIdct VP8TransformUV;
@ -684,14 +691,20 @@ VP8SimpleFilterFunc VP8SimpleHFilter16;
 VP8SimpleFilterFunc VP8SimpleVFilter16i;
 VP8SimpleFilterFunc VP8SimpleHFilter16i;

+void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
+                            int dst_stride);
+
 extern void VP8DspInitSSE2(void);
+extern void VP8DspInitSSE41(void);
 extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);
+extern void VP8DspInitMIPSdspR2(void);
+extern void VP8DspInitMSA(void);

 static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;

-void VP8DspInit(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
  if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;

  VP8InitClipTables();
@ -716,20 +729,66 @@ void VP8DspInit(void) {
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;

+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[3] = HE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[5] = VR4;
+  VP8PredLuma4[6] = LD4;
+  VP8PredLuma4[7] = VL4;
+  VP8PredLuma4[8] = HD4;
+  VP8PredLuma4[9] = HU4;
+
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
+
+  VP8DitherCombine8x8 = DitherCombine8x8;
+
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8DspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8DspInitSSE41();
+      }
+#endif
    }
-#elif defined(WEBP_USE_NEON)
+#endif
+#if defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      VP8DspInitNEON();
    }
-#elif defined(WEBP_USE_MIPS32)
+#endif
+#if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8DspInitMIPS32();
    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8DspInitMIPSdspR2();
+    }
+#endif
+#if defined(WEBP_USE_MSA)
+    if (VP8GetCPUInfo(kMSA)) {
+      VP8DspInitMSA();
+    }
 #endif
  }
  dec_last_cpuinfo_used = VP8GetCPUInfo;
--- a/src/dsp/dec_clip_tables.c
+++ b/src/dsp/dec_clip_tables.c
@ -344,7 +344,7 @@ const int8_t* const VP8ksclip2 = &sclip2[112];
 const uint8_t* const VP8kclip1 = &clip1[255];
 const uint8_t* const VP8kabs0 = &abs0[255];

-void VP8InitClipTables(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
 #if !defined(USE_STATIC_TABLES)
  int i;
  if (!tables_ok) {
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@ -16,6 +16,8 @@

 #if defined(WEBP_USE_MIPS32)

+#include "./mips_macro.h"
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;

@ -52,6 +54,7 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+  // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
@ -68,9 +71,9 @@ static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
 }

-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {  // t: pre-scaled threshold
  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  return ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) <= thresh);
+  return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);  // same test as the removed form when t = 2 * thresh + 1
 }

 static WEBP_INLINE int needs_filter2(const uint8_t* p,
@ -78,7 +81,7 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
  const int p3 = p[-4 * step], p2 = p[-3 * step];
  const int p1 = p[-2 * step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
-  if ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) > t) {
+  if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
    return 0;
  }
  return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
@ -89,8 +92,9 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
 static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh, ithresh)) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
        do_filter2(p, hstride);
      } else {
@ -104,8 +108,9 @@ static WEBP_INLINE void FilterLoop26(uint8_t* p,
 static WEBP_INLINE void FilterLoop24(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh, ithresh)) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
        do_filter2(p, hstride);
      } else {
@ -176,8 +181,9 @@ static void HFilter16i(uint8_t* p, int stride,

 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
+  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i, stride, thresh)) {
+    if (needs_filter(p + i, stride, thresh2)) {
      do_filter2(p + i, stride);
    }
  }
@ -185,8 +191,9 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {

 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  int i;
+  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i * stride, 1, thresh)) {
+    if (needs_filter(p + i * stride, 1, thresh2)) {
      do_filter2(p + i * stride, 1);
    }
  }
@ -384,7 +391,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "sra      %[temp7],  %[temp7],  3                  \n\t"
    "sra      %[temp4],  %[temp4],  3                  \n\t"
    "addiu    %[temp6],  $zero,     255                \n\t"
-    "lbu      %[temp1],  0(%[dst])                     \n\t"
+    "lbu      %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
    "sra      %[temp5],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
@ -392,8 +399,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "1:                                                  \n\t"
-    "lbu      %[temp18], 1(%[dst])                     \n\t"
-    "sb       %[temp1],  0(%[dst])                     \n\t"
+    "lbu      %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp18], %[temp18], %[temp11]          \n\t"
    "sra      %[temp11], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
@ -401,8 +408,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "2:                                                  \n\t"
-    "lbu      %[temp1],  2(%[dst])                     \n\t"
-    "sb       %[temp18], 1(%[dst])                     \n\t"
+    "lbu      %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
    "sra      %[temp8],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
@ -410,8 +417,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "3:                                                  \n\t"
-    "lbu      %[temp18], 3(%[dst])                     \n\t"
-    "sb       %[temp1],  2(%[dst])                     \n\t"
+    "lbu      %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp18], %[temp18], %[temp16]          \n\t"
    "sra      %[temp16], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
@ -419,11 +426,11 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "4:                                                  \n\t"
-    "sb       %[temp18], 3(%[dst])                     \n\t"
-    "lbu      %[temp5],  32(%[dst])                    \n\t"
-    "lbu      %[temp8],  33(%[dst])                    \n\t"
-    "lbu      %[temp11], 34(%[dst])                    \n\t"
-    "lbu      %[temp16], 35(%[dst])                    \n\t"
+    "sb       %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
    "addu     %[temp11], %[temp11], %[temp12]          \n\t"
@ -452,14 +459,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
  "8:                                                  \n\t"
-    "sb       %[temp5],  32(%[dst])                    \n\t"
-    "sb       %[temp8],  33(%[dst])                    \n\t"
-    "sb       %[temp11], 34(%[dst])                    \n\t"
-    "sb       %[temp16], 35(%[dst])                    \n\t"
-    "lbu      %[temp5],  64(%[dst])                    \n\t"
-    "lbu      %[temp8],  65(%[dst])                    \n\t"
-    "lbu      %[temp11], 66(%[dst])                    \n\t"
-    "lbu      %[temp16], 67(%[dst])                    \n\t"
+    "sb       %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
    "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
    "addu     %[temp11], %[temp11], %[temp0]           \n\t"
@ -488,14 +495,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "12:                                                 \n\t"
-    "sb       %[temp5],  64(%[dst])                    \n\t"
-    "sb       %[temp8],  65(%[dst])                    \n\t"
-    "sb       %[temp11], 66(%[dst])                    \n\t"
-    "sb       %[temp16], 67(%[dst])                    \n\t"
-    "lbu      %[temp5],  96(%[dst])                    \n\t"
-    "lbu      %[temp8],  97(%[dst])                    \n\t"
-    "lbu      %[temp11], 98(%[dst])                    \n\t"
-    "lbu      %[temp16], 99(%[dst])                    \n\t"
+    "sb       %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
    "addu     %[temp11], %[temp11], %[temp4]           \n\t"
@ -524,10 +531,10 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "16:                                                 \n\t"
-    "sb       %[temp5],  96(%[dst])                    \n\t"
-    "sb       %[temp8],  97(%[dst])                    \n\t"
-    "sb       %[temp11], 98(%[dst])                    \n\t"
-    "sb       %[temp16], 99(%[dst])                    \n\t"
+    "sb       %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"

    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
@ -548,15 +555,12 @@ static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  }
 }

-#endif  // WEBP_USE_MIPS32
-
 //------------------------------------------------------------------------------
 // Entry point

 extern void VP8DspInitMIPS32(void);

-void VP8DspInitMIPS32(void) {
-#if defined(WEBP_USE_MIPS32)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
  VP8InitClipTables();

  VP8Transform = TransformTwo;
@ -574,5 +578,10 @@ void VP8DspInitMIPS32(void) {
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif  // WEBP_USE_MIPS32
 }
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@ -0,0 +1,994 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of dsp functions
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./mips_macro.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {  // DC-only path: in[0] is the only coefficient loaded here
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
+
+  __asm__ volatile (
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    "lh               %[temp5],  0(%[in])               \n\t"  // temp5 = in[0]
+    "addiu            %[temp5],  %[temp5],  4           \n\t"  // + 4 so the shift below rounds
+    "ins              %[temp5],  %[temp5],  16, 16      \n\t"  // copy the low half into the high half
+    "shra.ph          %[temp5],  %[temp5],  3           \n\t"  // both halves = (in[0] + 4) >> 3
+    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
+                            temp3, temp1, temp2, temp3, temp4)
+    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
+                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
+                     dst, 0, 1, 2, 3, BPS)  // NOTE(review): presumably saturating dst += DC; macro is in mips_macro.h
+
+    OUTPUT_EARLY_CLOBBER_REGS_10()
+    : [in]"r"(in), [dst]"r"(dst)
+    : "memory"  // the asm writes through dst
+  );
+}
+
+static void TransformAC3(const int16_t* in, uint8_t* dst) {  // reads in[0], in[1] and in[4] only
+  const int a = in[0] + 4;  // DC term plus the rounding bias
+  int c4 = MUL(in[4], kC2);  // not const: the asm modifies it (see the "+&r" operand)
+  const int d4 = MUL(in[4], kC1);  // MUL is a multiply followed by >> 16
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+  __asm__ volatile (
+    "ins              %[c4],      %[d4],     16,       16    \n\t"  // pack: high half = d4, low half = c4
+    "replv.ph         %[temp1],   %[a]                       \n\t"  // temp1 = {a, a}
+    "replv.ph         %[temp4],   %[d1]                      \n\t"  // temp4 = {d1, d1}
+    ADD_SUB_HALVES(temp2, temp3, temp1, c4)
+    "replv.ph         %[temp5],   %[c1]                      \n\t"  // temp5 = {c1, c1}
+    SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
+                   temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
+    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
+                            temp11, temp17, temp3, temp5, temp11, temp12)
+    PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
+                          temp4, temp7, temp6, temp10, temp9)
+    STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
+                     temp17, temp12, temp18, temp1, temp8, temp2, temp4,
+                     temp7, temp6, dst, 0, 1, 2, 3, BPS)  // NOTE(review): presumably saturating add into dst; see mips_macro.h
+
+    OUTPUT_EARLY_CLOBBER_REGS_18(),
+      [c4]"+&r"(c4)  // read-write: updated by the "ins" above
+    : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
+    : "memory"  // the asm writes through dst
+  );
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {  // NOTE(review): presumably the full 4x4 inverse transform added into dst -- confirm against the C version
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+  __asm__ volatile (
+    "ulw              %[temp1],   0(%[in])                 \n\t"  // unaligned word load: in[0], in[1]
+    "ulw              %[temp2],   16(%[in])                \n\t"  // byte offset 16: in[8], in[9]
+    LOAD_IN_X2(temp5, temp6, 24, 26)
+    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+    LOAD_IN_X2(temp1, temp2, 8, 10)
+    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+                  temp13, temp11, temp14, temp12)
+    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+    "ulw              %[temp17],  4(%[in])                 \n\t"  // in[2], in[3]
+    "ulw              %[temp18],  20(%[in])                \n\t"  // in[10], in[11]
+    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+    LOAD_IN_X2(temp17, temp18, 12, 14)
+    LOAD_IN_X2(temp9, temp10, 28, 30)
+    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+                  temp15, temp4, temp16, temp17)
+    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+    // horizontal
+    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+    "repl.ph          %[temp2],   0x4                      \n\t"  // temp2 = {4, 4}: bias added to both halves below
+    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
+    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
+    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+                  temp6, temp17, temp8, temp18)
+    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+                  temp18, temp12, temp17, temp16)
+    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+                   temp6)
+    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+                          temp16, temp11, temp10, temp15, temp14)
+    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+                            temp11, temp10, temp11, temp14, temp15)
+    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+                     dst, 0, 1, 2, 3, BPS)  // NOTE(review): presumably saturating add into dst; see mips_macro.h
+
+    OUTPUT_EARLY_CLOBBER_REGS_18()
+    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)  // kC1/kC2 are not named in the visible asm text; presumably used inside the macros
+    : "memory", "hi", "lo"  // hi/lo: presumably clobbered by MUL_SHIFT_SUM -- confirm in mips_macro.h
+  );
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {  // one block, or two when do_two != 0
+  TransformOne(in, dst);  // first block
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);  // second block: next 16 coefficients, 4 bytes further along dst
+  }
+}
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15;
+
+  __asm__ volatile (
+    ".set      push                                      \n\t"
+    ".set      noreorder                                 \n\t"
+  "1:                                                    \n\t"
+    "negu      %[temp1],  %[hstride]                     \n\t"
+    "addiu     %[size],   %[size],        -1             \n\t"
+    "sll       %[temp2],  %[hstride],     1              \n\t"
+    "sll       %[temp3],  %[temp1],       1              \n\t"
+    "addu      %[temp4],  %[temp2],       %[hstride]     \n\t"
+    "addu      %[temp5],  %[temp3],       %[temp1]       \n\t"
+    "lbu       %[temp7],  0(%[p])                        \n\t"
+    "sll       %[temp6],  %[temp3],       1              \n\t"
+    "lbux      %[temp8],  %[temp5](%[p])                 \n\t"
+    "lbux      %[temp9],  %[temp3](%[p])                 \n\t"
+    "lbux      %[temp10], %[temp1](%[p])                 \n\t"
+    "lbux      %[temp11], %[temp6](%[p])                 \n\t"
+    "lbux      %[temp12], %[hstride](%[p])               \n\t"
+    "lbux      %[temp13], %[temp2](%[p])                 \n\t"
+    "lbux      %[temp14], %[temp4](%[p])                 \n\t"
+    "subu      %[temp1],  %[temp10],      %[temp7]       \n\t"
+    "subu      %[temp2],  %[temp9],       %[temp12]      \n\t"
+    "absq_s.w  %[temp3],  %[temp1]                       \n\t"
+    "absq_s.w  %[temp4],  %[temp2]                       \n\t"
+    "negu      %[temp1],  %[temp1]                       \n\t"
+    "sll       %[temp3],  %[temp3],       2              \n\t"
+    "addu      %[temp15], %[temp3],       %[temp4]       \n\t"
+    "subu      %[temp3],  %[temp15],      %[thresh2]     \n\t"
+    "sll       %[temp6],  %[temp1],       1              \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp4],  %[temp11],      %[temp8]       \n\t"
+    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
+    "shll_s.w  %[temp2],  %[temp2],       24             \n\t"
+    "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
+    "bgtz      %[temp4],  3f                             \n\t"
+    " subu     %[temp3],  %[temp8],       %[temp9]       \n\t"
+    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp5],  %[temp9],       %[temp10]      \n\t"
+    "absq_s.w  %[temp3],  %[temp5]                       \n\t"
+    "absq_s.w  %[temp5],  %[temp5]                       \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp3],  %[temp14],      %[temp13]      \n\t"
+    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
+    "slt       %[temp5],  %[hev_thresh],  %[temp5]       \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp3],  %[temp13],      %[temp12]      \n\t"
+    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
+    "sra       %[temp4],  %[temp2],       24             \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp15], %[temp12],      %[temp7]       \n\t"
+    "absq_s.w  %[temp3],  %[temp15]                      \n\t"
+    "absq_s.w  %[temp15], %[temp15]                      \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " slt      %[temp15], %[hev_thresh],  %[temp15]      \n\t"
+    "addu      %[temp3],  %[temp6],       %[temp1]       \n\t"
+    "or        %[temp2],  %[temp5],       %[temp15]      \n\t"
+    "addu      %[temp5],  %[temp4],       %[temp3]       \n\t"
+    "beqz      %[temp2],  4f                             \n\t"
+    " shra_r.w %[temp1],  %[temp5],       3              \n\t"
+    "addiu     %[temp2],  %[temp5],       3              \n\t"
+    "sra       %[temp2],  %[temp2],       3              \n\t"
+    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
+    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
+    "subu      %[temp3],  %[p],           %[hstride]     \n\t"
+    "sra       %[temp1],  %[temp1],       27             \n\t"
+    "sra       %[temp2],  %[temp2],       27             \n\t"
+    "subu      %[temp1],  %[temp7],       %[temp1]       \n\t"
+    "addu      %[temp2],  %[temp10],      %[temp2]       \n\t"
+    "lbux      %[temp2],  %[temp2](%[VP8kclip1])         \n\t"
+    "lbux      %[temp1],  %[temp1](%[VP8kclip1])         \n\t"
+    "sb        %[temp2],  0(%[temp3])                    \n\t"
+    "j         3f                                        \n\t"
+    " sb       %[temp1],  0(%[p])                        \n\t"
+  "4:                                                    \n\t"
+    "shll_s.w  %[temp5],  %[temp5],       24             \n\t"
+    "subu      %[temp14], %[p],           %[hstride]     \n\t"
+    "subu      %[temp11], %[temp14],      %[hstride]     \n\t"
+    "sra       %[temp6],  %[temp5],       24             \n\t"
+    "sll       %[temp1],  %[temp6],       3              \n\t"
+    "subu      %[temp15], %[temp11],      %[hstride]     \n\t"
+    "addu      %[temp2],  %[temp6],       %[temp1]       \n\t"
+    "sll       %[temp3],  %[temp2],       1              \n\t"
+    "addu      %[temp4],  %[temp3],       %[temp2]       \n\t"
+    "addiu     %[temp2],  %[temp2],       63             \n\t"
+    "addiu     %[temp3],  %[temp3],       63             \n\t"
+    "addiu     %[temp4],  %[temp4],       63             \n\t"
+    "sra       %[temp2],  %[temp2],       7              \n\t"
+    "sra       %[temp3],  %[temp3],       7              \n\t"
+    "sra       %[temp4],  %[temp4],       7              \n\t"
+    "addu      %[temp1],  %[temp8],       %[temp2]       \n\t"
+    "addu      %[temp5],  %[temp9],       %[temp3]       \n\t"
+    "addu      %[temp6],  %[temp10],      %[temp4]       \n\t"
+    "subu      %[temp8],  %[temp7],       %[temp4]       \n\t"
+    "subu      %[temp7],  %[temp12],      %[temp3]       \n\t"
+    "addu      %[temp10], %[p],           %[hstride]     \n\t"
+    "subu      %[temp9],  %[temp13],      %[temp2]       \n\t"
+    "addu      %[temp12], %[temp10],      %[hstride]     \n\t"
+    "lbux      %[temp2],  %[temp1](%[VP8kclip1])         \n\t"
+    "lbux      %[temp3],  %[temp5](%[VP8kclip1])         \n\t"
+    "lbux      %[temp4],  %[temp6](%[VP8kclip1])         \n\t"
+    "lbux      %[temp5],  %[temp8](%[VP8kclip1])         \n\t"
+    "lbux      %[temp6],  %[temp7](%[VP8kclip1])         \n\t"
+    "lbux      %[temp8],  %[temp9](%[VP8kclip1])         \n\t"
+    "sb        %[temp2],  0(%[temp15])                   \n\t"
+    "sb        %[temp3],  0(%[temp11])                   \n\t"
+    "sb        %[temp4],  0(%[temp14])                   \n\t"
+    "sb        %[temp5],  0(%[p])                        \n\t"
+    "sb        %[temp6],  0(%[temp10])                   \n\t"
+    "sb        %[temp8],  0(%[temp12])                   \n\t"
+  "3:                                                    \n\t"
+    "bgtz      %[size],   1b                             \n\t"
+    " addu     %[p],      %[p],           %[vstride]     \n\t"
+    ".set      pop                                       \n\t"
+    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+      [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
+      [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
+      [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
+      [size]"+&r"(size), [p]"+&r"(p)
+    : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
+      [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
+      [VP8kclip1]"r"(VP8kclip1)
+    : "memory"
+  );
+}
+
+// Edge filter kernel written in MIPS DSP inline assembly.
+// At every visited position the four samples on each side of 'p' are read at
+// multiples of 'hstride': p3..p0 from p[-4*hstride]..p[-hstride] and q0..q3
+// from p[0]..p[3*hstride]. After each position 'p' advances by 'vstride' and
+// 'size' is decremented; the loop repeats while 'size' stays positive.
+// A position is left untouched when
+//   4*|p0-q0| + |p1-q1| > 2*thresh + 1
+// or when any of |p3-p2|, |p2-p1|, |p1-p0|, |q3-q2|, |q2-q1|, |q1-q0| is
+// greater than 'ithresh'. Otherwise:
+//   - if |p1-p0| or |q1-q0| is greater than 'hev_thresh', only p0 and q0 are
+//     rewritten (label 1);
+//   - else p1, p0, q0 and q1 are rewritten (fall-through path).
+// New sample values are fetched through the VP8kclip1 lookup table, which is
+// defined elsewhere in the project.
+// NOTE(review): the entry guard is 'bltz', so size == 0 still executes one
+// iteration; the callers visible in this file pass 8 or 16.
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  int p0, q0, p1, q1, p2, q2, p3, q3;
+  int step1, step2, temp1, temp2, temp3, temp4;
+  uint8_t* pTemp0;
+  uint8_t* pTemp1;
+  const int thresh2 = 2 * thresh + 1;
+
+  // 'noreorder' is in effect: the instruction written with a leading space
+  // after each branch/jump sits in its delay slot and always executes.
+  __asm__ volatile (
+    ".set      push                                   \n\t"
+    ".set      noreorder                              \n\t"
+    "bltz      %[size],    3f                         \n\t"
+    " nop                                             \n\t"
+  "2:                                                 \n\t"
+    // Load q0, p0, q1, p1 and test 4*|p0-q0| + |p1-q1| against thresh2.
+    // temp1 keeps (p0 - q0) and temp3 keeps (p1 - q1) for later use.
+    "negu      %[step1],   %[hstride]                 \n\t"
+    "lbu       %[q0],      0(%[p])                    \n\t"
+    "lbux      %[p0],      %[step1](%[p])             \n\t"
+    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
+    "lbux      %[q1],      %[hstride](%[p])           \n\t"
+    "subu      %[temp1],   %[p0],         %[q0]       \n\t"
+    "lbux      %[p1],      %[step1](%[p])             \n\t"
+    "addu      %[step2],   %[hstride],    %[hstride]  \n\t"
+    "absq_s.w  %[temp2],   %[temp1]                   \n\t"
+    "subu      %[temp3],   %[p1],         %[q1]       \n\t"
+    "absq_s.w  %[temp4],   %[temp3]                   \n\t"
+    "sll       %[temp2],   %[temp2],      2           \n\t"
+    "addu      %[temp2],   %[temp2],      %[temp4]    \n\t"
+    "subu      %[temp4],   %[temp2],      %[thresh2]  \n\t"
+    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " lbux     %[p2],      %[step1](%[p])             \n\t"
+    // Load the outer samples (q2, p3, q3) and test the six neighbour
+    // differences against ithresh, leaving for label 0 on the first failure.
+    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
+    "lbux      %[q2],      %[step2](%[p])             \n\t"
+    "lbux      %[p3],      %[step1](%[p])             \n\t"
+    "subu      %[temp4],   %[p2],         %[p1]       \n\t"
+    "addu      %[step2],   %[step2],      %[hstride]  \n\t"
+    "subu      %[temp2],   %[p3],         %[p2]       \n\t"
+    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
+    "absq_s.w  %[temp2],   %[temp2]                   \n\t"
+    "lbux      %[q3],      %[step2](%[p])             \n\t"
+    "subu      %[temp4],   %[temp4],      %[ithresh]  \n\t"
+    "negu      %[temp1],   %[temp1]                   \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " subu     %[temp2],   %[temp2],      %[ithresh]  \n\t"
+    // From here p3 is reused to hold |p1 - p0|.
+    "subu      %[p3],      %[p1],         %[p0]       \n\t"
+    "bgtz      %[temp2],   0f                         \n\t"
+    " absq_s.w %[p3],      %[p3]                      \n\t"
+    "subu      %[temp4],   %[q3],         %[q2]       \n\t"
+    "subu      %[pTemp0],  %[p],          %[hstride]  \n\t"
+    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
+    "subu      %[temp2],   %[p3],         %[ithresh]  \n\t"
+    "sll       %[step1],   %[temp1],      1           \n\t"
+    "bgtz      %[temp2],   0f                         \n\t"
+    " subu     %[temp4],   %[temp4],      %[ithresh]  \n\t"
+    "subu      %[temp2],   %[q2],         %[q1]       \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " absq_s.w %[temp2],   %[temp2]                   \n\t"
+    // From here q3 is reused to hold |q1 - q0|; temp1 becomes 3 * (q0 - p0).
+    "subu      %[q3],      %[q1],         %[q0]       \n\t"
+    "absq_s.w  %[q3],      %[q3]                      \n\t"
+    "subu      %[temp2],   %[temp2],      %[ithresh]  \n\t"
+    "addu      %[temp1],   %[temp1],      %[step1]    \n\t"
+    "bgtz      %[temp2],   0f                         \n\t"
+    " subu     %[temp4],   %[q3],         %[ithresh]  \n\t"
+    // High-edge-variance flag: (|p1-p0| > hev_thresh) | (|q1-q0| > hev_thresh).
+    "slt       %[p3],      %[hev_thresh], %[p3]       \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " slt      %[q3],      %[hev_thresh], %[q3]       \n\t"
+    "or        %[q3],      %[q3],         %[p3]       \n\t"
+    "bgtz      %[q3],      1f                         \n\t"
+    " shra_r.w %[temp2],   %[temp1],      3           \n\t"
+    // Flag clear: with a = 3 * (q0 - p0), temp2 = (a + 4) >> 3 and
+    // temp1 = (a + 3) >> 3, each saturated to a signed 5-bit value by the
+    // shll_s.w/sra pair; step1 = (temp2 + 1) >> 1 adjusts p1 and q1.
+    "addiu     %[temp1],   %[temp1],      3           \n\t"
+    "sra       %[temp1],   %[temp1],      3           \n\t"
+    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
+    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
+    "addu      %[pTemp1],  %[p],          %[hstride]  \n\t"
+    "sra       %[temp2],   %[temp2],      27          \n\t"
+    "sra       %[temp1],   %[temp1],      27          \n\t"
+    "addiu     %[step1],   %[temp2],      1           \n\t"
+    "sra       %[step1],   %[step1],      1           \n\t"
+    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
+    "addu      %[p1],      %[p1],         %[step1]    \n\t"
+    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
+    "subu      %[q1],      %[q1],         %[step1]    \n\t"
+    "lbux      %[temp2],   %[p0](%[VP8kclip1])        \n\t"
+    "lbux      %[temp3],   %[q0](%[VP8kclip1])        \n\t"
+    "lbux      %[temp4],   %[q1](%[VP8kclip1])        \n\t"
+    "sb        %[temp2],   0(%[pTemp0])               \n\t"
+    "lbux      %[temp1],   %[p1](%[VP8kclip1])        \n\t"
+    "subu      %[pTemp0],  %[pTemp0],    %[hstride]   \n\t"
+    "sb        %[temp3],   0(%[p])                    \n\t"
+    "sb        %[temp4],   0(%[pTemp1])               \n\t"
+    "j         0f                                     \n\t"
+    " sb       %[temp1],   0(%[pTemp0])               \n\t"
+  "1:                                                 \n\t"
+    // Flag set: a = 3 * (q0 - p0) + (p1 - q1 saturated to signed 8 bits);
+    // only p0 (at p - hstride) and q0 (at p) are rewritten.
+    "shll_s.w  %[temp3],   %[temp3],      24          \n\t"
+    "sra       %[temp3],   %[temp3],      24          \n\t"
+    "addu      %[temp1],   %[temp1],      %[temp3]    \n\t"
+    "shra_r.w  %[temp2],   %[temp1],      3           \n\t"
+    "addiu     %[temp1],   %[temp1],      3           \n\t"
+    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
+    "sra       %[temp1],   %[temp1],      3           \n\t"
+    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
+    "sra       %[temp2],   %[temp2],      27          \n\t"
+    "sra       %[temp1],   %[temp1],      27          \n\t"
+    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
+    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
+    "lbux      %[temp1],   %[p0](%[VP8kclip1])        \n\t"
+    "lbux      %[temp2],   %[q0](%[VP8kclip1])        \n\t"
+    "sb        %[temp2],   0(%[p])                    \n\t"
+    "sb        %[temp1],   0(%[pTemp0])               \n\t"
+  "0:                                                 \n\t"
+    // Next position: decrement the counter, advance p in the delay slot.
+    "subu      %[size],    %[size],       1           \n\t"
+    "bgtz      %[size],    2b                         \n\t"
+    " addu     %[p],       %[p],          %[vstride]  \n\t"
+  "3:                                                 \n\t"
+    ".set      pop                                    \n\t"
+    : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
+      [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
+      [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+      [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
+      [size]"+&r"(size)
+    : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
+      [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
+      [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+    : "memory"
+  );
+}
+
+// on macroblock edges
+// Hands 16 positions to FilterLoop26, with 'stride' as the first step
+// argument and 1 as the second. FilterLoop26's signature is outside this
+// view; presumably it is (p, hstride, vstride, size, ...) like FilterLoop24.
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  const int tap_step = stride;  // first step argument
+  const int pos_step = 1;       // second step argument
+  const int num_pos = 16;
+  FilterLoop26(p, tap_step, pos_step, num_pos, thresh, ithresh, hev_thresh);
+}
+
+// Hands 16 positions to FilterLoop26, with 1 as the first step argument and
+// 'stride' as the second. FilterLoop26's signature is outside this view;
+// presumably it is (p, hstride, vstride, size, ...) like FilterLoop24.
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  const int tap_step = 1;       // first step argument
+  const int pos_step = stride;  // second step argument
+  const int num_pos = 16;
+  FilterLoop26(p, tap_step, pos_step, num_pos, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+// Runs FilterLoop26 over 8 positions of 'u' and then 8 positions of 'v',
+// using the same step arguments (stride, 1) and thresholds for both.
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  const int tap_step = stride;
+  const int pos_step = 1;
+  const int num_pos = 8;
+  FilterLoop26(u, tap_step, pos_step, num_pos, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, tap_step, pos_step, num_pos, thresh, ithresh, hev_thresh);
+}
+
+// Runs FilterLoop26 over 8 positions of 'u' and then 8 positions of 'v',
+// using the same step arguments (1, stride) and thresholds for both.
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  const int tap_step = 1;
+  const int pos_step = stride;
+  const int num_pos = 8;
+  FilterLoop26(u, tap_step, pos_step, num_pos, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, tap_step, pos_step, num_pos, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+// Calls FilterLoop24 three times, at p + 4*stride, p + 8*stride and
+// p + 12*stride, each time over 16 positions one byte apart.
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  uint8_t* edge = p;
+  int remaining = 3;
+  while (remaining > 0) {
+    edge += 4 * stride;  // move to the next edge before filtering it
+    FilterLoop24(edge, stride, 1, 16, thresh, ithresh, hev_thresh);
+    --remaining;
+  }
+}
+
+// Calls FilterLoop24 three times, at p + 4, p + 8 and p + 12, each time over
+// 16 positions that are 'stride' bytes apart.
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  uint8_t* edge = p;
+  int remaining = 3;
+  while (remaining > 0) {
+    edge += 4;  // move to the next edge before filtering it
+    FilterLoop24(edge, 1, stride, 16, thresh, ithresh, hev_thresh);
+    --remaining;
+  }
+}
+
+// Filters one edge located 4 rows below the start of 'u' and of 'v':
+// FilterLoop24 over 8 positions, taps 'stride' apart, positions 1 byte apart.
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  uint8_t* const u_edge = u + 4 * stride;
+  uint8_t* const v_edge = v + 4 * stride;
+  FilterLoop24(u_edge, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v_edge, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+// Filters one edge located 4 bytes past the start of 'u' and of 'v':
+// FilterLoop24 over 8 positions, taps 1 byte apart, positions 'stride' apart.
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  uint8_t* const u_edge = u + 4;
+  uint8_t* const v_edge = v + 4;
+  FilterLoop24(u_edge, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v_edge, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+#undef MUL
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+// Simple filter over 16 consecutive byte positions (MIPS DSP inline asm).
+// For each position the taps are p[-2*stride], p[-stride], p[0], p[stride]
+// (called p1, p0, q0, q1 in the comments below; held in temp0..temp3).
+// When 4*|p0-q0| + |p1-q1| > 2*thresh + 1 the position is left unchanged.
+// Otherwise, with a = 3*(q0-p0) + (p1-q1 saturated to signed 8 bits):
+//   p[-stride] = VP8kclip1[p0 + ((a + 3) >> 3)]
+//   p[0]       = VP8kclip1[q0 - ((a + 4) >> 3)]
+// where both shifted terms are first saturated to signed 5 bits.
+// VP8kclip1 is a lookup table defined elsewhere in the project.
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+  // Second write pointer, one row above 'p'; advanced in step with 'p'.
+  uint8_t* p1 = p - stride;
+  __asm__ volatile (
+    ".set      push                                      \n\t"
+    ".set      noreorder                                 \n\t"
+    "li        %[i],        16                           \n\t"
+  "0:                                                    \n\t"
+    // Load the four taps and evaluate the threshold test.
+    "negu      %[temp4],    %[stride]                    \n\t"
+    "sll       %[temp5],    %[temp4],       1            \n\t"
+    "lbu       %[temp2],    0(%[p])                      \n\t"
+    "lbux      %[temp3],    %[stride](%[p])              \n\t"
+    "lbux      %[temp1],    %[temp4](%[p])               \n\t"
+    "lbux      %[temp0],    %[temp5](%[p])               \n\t"
+    "subu      %[temp7],    %[temp1],       %[temp2]     \n\t"
+    "subu      %[temp6],    %[temp0],       %[temp3]     \n\t"
+    "absq_s.w  %[temp4],    %[temp7]                     \n\t"
+    "absq_s.w  %[temp5],    %[temp6]                     \n\t"
+    "sll       %[temp4],    %[temp4],       2            \n\t"
+    "subu      %[temp5],    %[temp5],       %[thresh2]   \n\t"
+    "addu      %[temp5],    %[temp4],       %[temp5]     \n\t"
+    "negu      %[temp8],    %[temp7]                     \n\t"
+    // The loop counter is decremented in the branch delay slot, so it is
+    // updated whether or not the position is filtered.
+    "bgtz      %[temp5],    1f                           \n\t"
+    " addiu    %[i],        %[i],           -1           \n\t"
+    // temp3 = a = 3*(q0-p0) + sat8(p1-q1); temp7 = (a+3)>>3; temp8 = (a+4)>>3.
+    "sll       %[temp4],    %[temp8],       1            \n\t"
+    "shll_s.w  %[temp5],    %[temp6],       24           \n\t"
+    "addu      %[temp3],    %[temp4],       %[temp8]     \n\t"
+    "sra       %[temp5],    %[temp5],       24           \n\t"
+    "addu      %[temp3],    %[temp3],       %[temp5]     \n\t"
+    "addiu     %[temp7],    %[temp3],       3            \n\t"
+    "sra       %[temp7],    %[temp7],       3            \n\t"
+    "shra_r.w  %[temp8],    %[temp3],       3            \n\t"
+    "shll_s.w  %[temp0],    %[temp7],       27           \n\t"
+    "shll_s.w  %[temp4],    %[temp8],       27           \n\t"
+    "sra       %[temp0],    %[temp0],       27           \n\t"
+    "sra       %[temp4],    %[temp4],       27           \n\t"
+    "addu      %[temp7],    %[temp1],       %[temp0]     \n\t"
+    "subu      %[temp2],    %[temp2],       %[temp4]     \n\t"
+    "lbux      %[temp3],    %[temp7](%[VP8kclip1])       \n\t"
+    "lbux      %[temp4],    %[temp2](%[VP8kclip1])       \n\t"
+    "sb        %[temp3],    0(%[p1])                     \n\t"
+    "sb        %[temp4],    0(%[p])                      \n\t"
+  "1:                                                    \n\t"
+    // Advance both pointers by one byte and loop while i > 0.
+    "addiu     %[p1],       %[p1],          1            \n\t"
+    "bgtz      %[i],        0b                           \n\t"
+    " addiu    %[p],        %[p],           1            \n\t"
+    " .set     pop                                       \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
+    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+    : "memory"
+  );
+}
+
+// Emits four 'lbu' (load byte unsigned) instructions for use inside an
+// inline-asm template:
+//   TEMP0 = SRC[A + A1 * BPS]
+//   TEMP1 = SRC[B + B1 * BPS]
+//   TEMP2 = SRC[C + C1 * BPS]
+//   TEMP3 = SRC[D + D1 * BPS]
+// TEMPn and SRC are asm operand names. The A..D1 arguments are pasted as text
+// into the address expression, with BPS expanded through XSTR() (both defined
+// elsewhere), so the offset is evaluated by the assembler.
+// The last line deliberately has no trailing line continuation, so the
+// definition cannot absorb whatever line follows it.
+#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
+                     A, A1, B, B1, C, C1, D, D1, SRC)                          \
+  "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
+
+// Simple filter over 16 positions that are 'stride' bytes apart (MIPS DSP
+// inline asm). For each position the taps are p[-2], p[-1], p[0], p[1]
+// (called p1, p0, q0, q1 in the comments below; held in temp0..temp3).
+// When 4*|p0-q0| + |p1-q1| > 2*thresh + 1 the position is left unchanged.
+// Otherwise, with a = 3*(q0-p0) + (p1-q1 saturated to signed 8 bits):
+//   p[-1] = VP8kclip1[p0 + ((a + 3) >> 3)]
+//   p[0]  = VP8kclip1[q0 - ((a + 4) >> 3)]
+// where both shifted terms are first saturated to signed 5 bits.
+// VP8kclip1 is a lookup table defined elsewhere in the project.
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+  __asm__ volatile (
+    ".set      push                                     \n\t"
+    ".set      noreorder                                \n\t"
+    "li        %[i],       16                           \n\t"
+  "0:                                                   \n\t"
+    // Load the four taps at byte offsets -2, -1, 0, +1 from p.
+    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
+    "subu      %[temp7],    %[temp1],       %[temp2]    \n\t"
+    "subu      %[temp6],    %[temp0],       %[temp3]    \n\t"
+    "absq_s.w  %[temp4],    %[temp7]                    \n\t"
+    "absq_s.w  %[temp5],    %[temp6]                    \n\t"
+    "sll       %[temp4],    %[temp4],       2           \n\t"
+    "addu      %[temp5],    %[temp4],       %[temp5]    \n\t"
+    "subu      %[temp5],    %[temp5],       %[thresh2]  \n\t"
+    "negu      %[temp8],    %[temp7]                    \n\t"
+    // The loop counter is decremented in the branch delay slot, so it is
+    // updated whether or not the position is filtered.
+    "bgtz      %[temp5],    1f                          \n\t"
+    " addiu    %[i],        %[i],           -1          \n\t"
+    // temp3 = a = 3*(q0-p0) + sat8(p1-q1); temp7 = (a+3)>>3; temp8 = (a+4)>>3.
+    "sll       %[temp4],    %[temp8],       1           \n\t"
+    "shll_s.w  %[temp5],    %[temp6],       24          \n\t"
+    "addu      %[temp3],    %[temp4],       %[temp8]    \n\t"
+    "sra       %[temp5],    %[temp5],       24          \n\t"
+    "addu      %[temp3],    %[temp3],       %[temp5]    \n\t"
+    "addiu     %[temp7],    %[temp3],       3           \n\t"
+    "sra       %[temp7],    %[temp7],       3           \n\t"
+    "shra_r.w  %[temp8],    %[temp3],       3           \n\t"
+    "shll_s.w  %[temp0],    %[temp7],       27          \n\t"
+    "shll_s.w  %[temp4],    %[temp8],       27          \n\t"
+    "sra       %[temp0],    %[temp0],       27          \n\t"
+    "sra       %[temp4],    %[temp4],       27          \n\t"
+    "addu      %[temp7],    %[temp1],       %[temp0]    \n\t"
+    "subu      %[temp2],    %[temp2],       %[temp4]    \n\t"
+    "lbux      %[temp3],    %[temp7](%[VP8kclip1])      \n\t"
+    "lbux      %[temp4],    %[temp2](%[VP8kclip1])      \n\t"
+    "sb        %[temp3],    -1(%[p])                    \n\t"
+    "sb        %[temp4],    0(%[p])                     \n\t"
+  "1:                                                   \n\t"
+    // Loop while i > 0; p moves to the next position in the delay slot.
+    "bgtz      %[i],        0b                          \n\t"
+    " addu     %[p],        %[p],           %[stride]   \n\t"
+    ".set      pop                                      \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [p]"+&r"(p), [i]"=&r"(i)
+    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+    : "memory"
+  );
+}
+
+// Applies SimpleVFilter16 at p + 4*stride, p + 8*stride and p + 12*stride.
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+  uint8_t* edge = p;
+  int remaining = 3;
+  while (remaining > 0) {
+    edge += 4 * stride;  // move to the next edge before filtering it
+    SimpleVFilter16(edge, stride, thresh);
+    --remaining;
+  }
+}
+
+// Applies SimpleHFilter16 at p + 4, p + 8 and p + 12.
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+  uint8_t* edge = p;
+  int remaining = 3;
+  while (remaining > 0) {
+    edge += 4;  // move to the next edge before filtering it
+    SimpleHFilter16(edge, stride, thresh);
+    --remaining;
+  }
+}
+
+// Emits two 'usw' (unaligned store word) instructions for use inside an
+// inline-asm template, writing 4 bytes each:
+//   DST[A * BPS]     = TEMP0
+//   DST[B + C * BPS] = TEMP1
+// TEMP0, TEMP1 and DST are asm operand names. A, B and C are pasted as text
+// into the address expression, with BPS expanded through XSTR() (both defined
+// elsewhere), so the offset is evaluated by the assembler.
+#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                              \
+  "usw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #DST "])         \n\t"     \
+  "usw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #DST "])  \n\t"
+
+// Vertical 4x4 prediction (MIPS DSP inline asm).
+// Reads six bytes from the row above 'dst' (top[-1] .. top[4], with
+// top = dst - BPS), combines each group of three neighbours as
+// (left + 2*centre + right + 2) >> 2, packs the four results into one word
+// and stores that same word at the start of rows 0, 1, 2 and 3 of 'dst'.
+// NOTE(review): the byte-to-lane mapping of the pack/unpack steps depends on
+// byte order; presumably this path targets little-endian builds - confirm.
+static void VE4(uint8_t* dst) {    // vertical
+  const uint8_t* top = dst - BPS;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+  __asm__ volatile (
+    // Unaligned loads: 4 bytes from top[-1], 2 bytes from top[3].
+    "ulw             %[temp0],   -1(%[top])              \n\t"
+    "ulh             %[temp1],   3(%[top])               \n\t"
+    // Widen bytes to halfword pairs so the sums cannot overflow 8 bits.
+    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
+    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
+    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
+    // temp5/temp6 hold the centre samples; doubled, then both neighbours added.
+    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
+    "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
+    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
+    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
+    "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
+    "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
+    "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
+    "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
+    // Rounding shift right by 2, then narrow the four halfwords to bytes.
+    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
+    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
+    "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
+    // Same 4-byte row written to rows 0..3.
+    STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
+    STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+// DC 4x4 prediction (MIPS DSP inline asm).
+// Sums the four bytes at dst[-BPS .. -BPS+3] and the four bytes at
+// dst[-1 + k*BPS] for k = 0..3, computes (sum + 4) >> 3, replicates that
+// byte across a word and stores the word at the start of rows 0..3 of 'dst'.
+static void DC4(uint8_t* dst) {   // DC
+  int temp0, temp1, temp2, temp3, temp4;
+  __asm__ volatile (
+    // temp0 = the four bytes of the row above; temp1..temp4 = left column.
+    "ulw          %[temp0],   -1*" XSTR(BPS) "(%[dst]) \n\t"
+    LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    // Pack the four left-column bytes into temp1 so one raddu can sum them.
+    "ins          %[temp1],   %[temp2],    8,     8    \n\t"
+    "ins          %[temp1],   %[temp3],    16,    8    \n\t"
+    "ins          %[temp1],   %[temp4],    24,    8    \n\t"
+    // raddu.w.qb adds the four bytes of a word.
+    "raddu.w.qb   %[temp0],   %[temp0]                 \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                 \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]    \n\t"
+    // Rounding shift by 3 (eight samples), then replicate into all 4 bytes.
+    "shra_r.w     %[temp0],   %[temp0],    3           \n\t"
+    "replv.qb     %[temp0],   %[temp0]                 \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+// Down-right 4x4 prediction (MIPS DSP inline asm).
+// Inputs read: the left column dst[-1 + k*BPS] for k = 0..3, the four bytes
+// starting at dst[-1 - BPS], and the byte at dst[3 - BPS].
+// Neighbouring samples are combined with doubled-centre sums followed by a
+// rounding shift right by 2 (shll.ph 1 / addq.ph / shra_r.ph 2).
+// Output: one 4-byte word per row. Rows 3 and 1 are stored first; the two
+// packed words are then shifted by one byte with 'prepend' and stored as
+// rows 2 and 0.
+// NOTE(review): lane ordering depends on byte order; presumably this path
+// targets little-endian builds - confirm.
+static void RD4(uint8_t* dst) {   // Down-right
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8;
+  __asm__ volatile (
+    // temp0..temp3 = left column (rows 0..3); temp7 = 4 bytes from the
+    // row above, starting one byte to the left of dst.
+    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    "ulw            %[temp7],   -1-" XSTR(BPS) "(%[dst])       \n\t"
+    // Build halfword pairs of adjacent samples and accumulate the sums.
+    "ins            %[temp1],   %[temp0], 16, 16               \n\t"
+    "preceu.ph.qbr  %[temp5],   %[temp7]                       \n\t"
+    "ins            %[temp2],   %[temp1], 16, 16               \n\t"
+    "preceu.ph.qbl  %[temp4],   %[temp7]                       \n\t"
+    "ins            %[temp3],   %[temp2], 16, 16               \n\t"
+    "shll.ph        %[temp2],   %[temp2], 1                    \n\t"
+    "addq.ph        %[temp3],   %[temp3], %[temp1]             \n\t"
+    "packrl.ph      %[temp6],   %[temp5], %[temp1]             \n\t"
+    "addq.ph        %[temp3],   %[temp3], %[temp2]             \n\t"
+    "addq.ph        %[temp1],   %[temp1], %[temp5]             \n\t"
+    "shll.ph        %[temp6],   %[temp6], 1                    \n\t"
+    "addq.ph        %[temp1],   %[temp1], %[temp6]             \n\t"
+    "packrl.ph      %[temp0],   %[temp4], %[temp5]             \n\t"
+    "addq.ph        %[temp8],   %[temp5], %[temp4]             \n\t"
+    "shra_r.ph      %[temp3],   %[temp3], 2                    \n\t"
+    "shll.ph        %[temp0],   %[temp0], 1                    \n\t"
+    "shra_r.ph      %[temp1],   %[temp1], 2                    \n\t"
+    "addq.ph        %[temp8],   %[temp0], %[temp8]             \n\t"
+    // Last value: byte dst[3 - BPS] replaces the low byte of temp7, the four
+    // bytes are summed with raddu.w.qb and rounded-shifted right by 2.
+    "lbu            %[temp5],   3-" XSTR(BPS) "(%[dst])        \n\t"
+    "precrq.ph.w    %[temp7],   %[temp7], %[temp7]             \n\t"
+    "shra_r.ph      %[temp8],   %[temp8], 2                    \n\t"
+    "ins            %[temp7],   %[temp5], 0,  8                \n\t"
+    "precr.qb.ph    %[temp2],   %[temp1], %[temp3]             \n\t"
+    "raddu.w.qb     %[temp4],   %[temp7]                       \n\t"
+    "precr.qb.ph    %[temp6],   %[temp8], %[temp1]             \n\t"
+    "shra_r.w       %[temp4],   %[temp4], 2                    \n\t"
+    // temp2 -> row 3, temp6 -> row 1.
+    STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
+    // Shift each packed row right by 8 bits, inserting a new top byte.
+    "prepend        %[temp2],   %[temp8], 8                    \n\t"
+    "prepend        %[temp6],   %[temp4], 8                    \n\t"
+    // temp2 -> row 2, temp6 -> row 0.
+    STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+// Emits two 'ulw' (unaligned load word) instructions for use inside an
+// inline-asm template, reading 4 bytes each:
+//   TEMP0 = SRC[A * BPS]
+//   TEMP1 = SRC[B + C * BPS]
+// TEMP0, TEMP1 and SRC are asm operand names. A, B and C are pasted as text
+// into the address expression, with BPS expanded through XSTR() (both defined
+// elsewhere), so the offset is evaluated by the assembler.
+#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                               \
+  "ulw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #SRC "])         \n\t"     \
+  "ulw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "])  \n\t"
+
+// Down-left 4x4 prediction (MIPS DSP inline asm).
+// Inputs read: eight bytes from the row above, as two words at dst[-BPS] and
+// dst[4 - BPS]. Neighbouring samples are combined with doubled-centre sums
+// followed by a rounding shift right by 2 (shll.ph 1 / addq.ph / shra_r.ph 2).
+// Output: one 4-byte word per row. Rows 0 and 2 are stored first; the two
+// packed words are then shifted by one byte with 'prepend' and stored as
+// rows 1 and 3.
+// NOTE(review): lane ordering depends on byte order; presumably this path
+// targets little-endian builds - confirm.
+static void LD4(uint8_t* dst) {   // Down-Left
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  __asm__ volatile (
+    // temp0 = bytes 0..3 of the row above, temp1 = bytes 4..7.
+    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+    // Widen to halfword pairs, then form the shifted "centre" pairs.
+    "preceu.ph.qbl   %[temp2],    %[temp0]                     \n\t"
+    "preceu.ph.qbr   %[temp3],    %[temp0]                     \n\t"
+    "preceu.ph.qbr   %[temp4],    %[temp1]                     \n\t"
+    "preceu.ph.qbl   %[temp5],    %[temp1]                     \n\t"
+    "packrl.ph       %[temp6],    %[temp2],    %[temp3]        \n\t"
+    "packrl.ph       %[temp7],    %[temp4],    %[temp2]        \n\t"
+    "packrl.ph       %[temp8],    %[temp5],    %[temp4]        \n\t"
+    "shll.ph         %[temp6],    %[temp6],    1               \n\t"
+    "addq.ph         %[temp9],    %[temp2],    %[temp6]        \n\t"
+    "shll.ph         %[temp7],    %[temp7],    1               \n\t"
+    "addq.ph         %[temp9],    %[temp9],    %[temp3]        \n\t"
+    "shll.ph         %[temp8],    %[temp8],    1               \n\t"
+    "shra_r.ph       %[temp9],    %[temp9],    2               \n\t"
+    "addq.ph         %[temp3],    %[temp4],    %[temp7]        \n\t"
+    "addq.ph         %[temp0],    %[temp5],    %[temp8]        \n\t"
+    "addq.ph         %[temp3],    %[temp3],    %[temp2]        \n\t"
+    "addq.ph         %[temp0],    %[temp0],    %[temp4]        \n\t"
+    "shra_r.ph       %[temp3],    %[temp3],    2               \n\t"
+    "shra_r.ph       %[temp0],    %[temp0],    2               \n\t"
+    // Last value: twice the top byte of temp1 plus the byte-sum of temp5,
+    // rounded-shifted right by 2.
+    "srl             %[temp1],    %[temp1],    24              \n\t"
+    "sll             %[temp1],    %[temp1],    1               \n\t"
+    "raddu.w.qb      %[temp5],    %[temp5]                     \n\t"
+    "precr.qb.ph     %[temp9],    %[temp3],    %[temp9]        \n\t"
+    "precr.qb.ph     %[temp3],    %[temp0],    %[temp3]        \n\t"
+    "addu            %[temp1],    %[temp1],    %[temp5]        \n\t"
+    "shra_r.w        %[temp1],    %[temp1],    2               \n\t"
+    // temp9 -> row 0, temp3 -> row 2.
+    STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
+    // Shift each packed row right by 8 bits, inserting a new top byte.
+    "prepend         %[temp9],    %[temp0],    8               \n\t"
+    "prepend         %[temp3],    %[temp1],    8               \n\t"
+    // temp9 -> row 1, temp3 -> row 3.
+    STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+// DC prediction for an 8x8 block (MIPS DSP inline asm).
+// Sums the eight bytes at dst[-BPS .. -BPS+7] and the eight bytes at
+// dst[-1 + k*BPS] for k = 0..7, computes (sum + 8) >> 4, replicates that
+// byte across a word and stores it twice per row (byte offsets 0 and 4)
+// for rows 0..7 of 'dst'.
+static void DC8uv(uint8_t* dst) {     // DC
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  __asm__ volatile (
+    // temp0/temp1 = the row above (two words); temp2..temp9 = left column.
+    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+    // raddu.w.qb adds the four bytes of a word; the rest is a reduction tree.
+    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
+    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
+    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
+    "addu         %[temp8],   %[temp8],    %[temp9]      \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
+    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp2]      \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp6]      \n\t"
+    // Rounding shift by 4 (sixteen samples), then replicate into all 4 bytes.
+    "shra_r.w     %[temp0],   %[temp0],    4             \n\t"
+    "replv.qb     %[temp0],   %[temp0]                   \n\t"
+    // Each macro call fills one 8-byte row: offsets 0 and 4 of row N.
+    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+// DC prediction for an 8x8 block using only the row above (MIPS DSP asm).
+// Sums the eight bytes at dst[-BPS .. -BPS+7], computes (sum + 4) >> 3,
+// replicates that byte across a word and stores it twice per row (byte
+// offsets 0 and 4) for rows 0..7 of 'dst'. The left column is never read.
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+  int temp0, temp1;
+  __asm__ volatile (
+    // temp0/temp1 = the row above, as two words.
+    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+    // raddu.w.qb adds the four bytes of a word.
+    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
+    // Rounding shift by 3 (eight samples), then replicate into all 4 bytes.
+    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
+    "replv.qb     %[temp0],   %[temp0]                   \n\t"
+    // Each macro call fills one 8-byte row: offsets 0 and 4 of row N.
+    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+/* Chroma DC predictor for blocks without a usable top row: the eight values
+ * fetched by the two LOAD_4_BYTES calls (temp2..temp8 plus temp1, which is
+ * used as the eighth destination) are added pairwise into temp0, divided by 8
+ * with rounding (shra_r.w by 3), replicated to all four byte lanes (replv.qb)
+ * and written by the eight STORE_8_BYTES calls.
+ * NOTE(review): LOAD_4_BYTES and STORE_8_BYTES are defined outside this
+ * excerpt; presumably the loads read single bytes at offset -1 of rows 0..7.
+ */
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8;
+  __asm__ volatile (
+    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
+    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
+    "addu         %[temp8],   %[temp8],    %[temp1]      \n\t"
+    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
+    "addu         %[temp0],   %[temp6],    %[temp2]      \n\t"
+    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
+    "replv.qb     %[temp0],   %[temp0]                   \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+#undef LOAD_8_BYTES
+#undef STORE_8_BYTES
+#undef LOAD_4_BYTES
+/* Assembly fragment shared by the TrueMotion predictors below.
+ * On entry temp0 (and temp1 when SIZE is 8) holds four packed bytes and dst_1
+ * holds a pair of 16-bit offsets. Each byte is zero-extended to a halfword
+ * (preceu.ph.qbl / preceu.ph.qbr), the offset is added (addu.ph), and the
+ * sums are packed back to bytes in temp0 / temp1 through a saturating left
+ * shift by 7 followed by precrqu_s.qb.ph; that pair of instructions is meant
+ * to clamp each sum to the 0..255 range. temp2 and temp3 are scratch.
+ * The .if directives are evaluated by the assembler, so the temp1 / temp3
+ * half of the work is only emitted when SIZE is 8.
+ */
+#define CLIPPING(SIZE)                                                         \
+  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
+  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
+  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
+".endif                                                  \n\t"                 \
+  "addu.ph         %[temp2],   %[temp2],   %[dst_1]      \n\t"                 \
+  "addu.ph         %[temp0],   %[temp0],   %[dst_1]      \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "addu.ph         %[temp3],   %[temp3],   %[dst_1]      \n\t"                 \
+  "addu.ph         %[temp1],   %[temp1],   %[dst_1]      \n\t"                 \
+".endif                                                  \n\t"                 \
+  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
+  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
+  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
+".endif                                                  \n\t"                 \
+  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"                 \
+".endif                                                  \n\t"
+
+/* Predicts one row of SIZE pixels for the TrueMotion modes; the .if
+ * directives select the 4-pixel path when SIZE is below 8, otherwise the
+ * 8-pixel path, with a second group of 8 when SIZE is 16.
+ * dst_1 starts as the byte DST[-1] duplicated in both halfwords; subu.ph then
+ * subtracts top_1 from it. The packed bytes read from TOP go through
+ * CLIPPING, which adds dst_1 to each of them, and the result is written to
+ * DST with unaligned word stores (usw). For SIZE equal to 16 the second group
+ * reuses the already adjusted dst_1.
+ * The macro expects a variable named top_1 to be in scope where it expands.
+ */
+#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
+  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                              \
+  int temp0, temp1, temp2, temp3;                                              \
+  __asm__ volatile (                                                           \
+  ".if " #SIZE " < 8                                     \n\t"                 \
+    "ulw             %[temp0],   0(%[top])               \n\t"                 \
+    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
+    CLIPPING(4)                                                                \
+    "usw             %[temp0],   0(%[dst])               \n\t"                 \
+  ".else                                                 \n\t"                 \
+    "ulw             %[temp0],   0(%[top])               \n\t"                 \
+    "ulw             %[temp1],   4(%[top])               \n\t"                 \
+    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
+    CLIPPING(8)                                                                \
+    "usw             %[temp0],   0(%[dst])               \n\t"                 \
+    "usw             %[temp1],   4(%[dst])               \n\t"                 \
+  ".if " #SIZE " == 16                                   \n\t"                 \
+    "ulw             %[temp0],   8(%[top])               \n\t"                 \
+    "ulw             %[temp1],   12(%[top])              \n\t"                 \
+    CLIPPING(8)                                                                \
+    "usw             %[temp0],   8(%[dst])               \n\t"                 \
+    "usw             %[temp1],   12(%[dst])              \n\t"                 \
+  ".endif                                                \n\t"                 \
+  ".endif                                                \n\t"                 \
+    : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),           \
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
+    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                      \
+    : "memory"                                                                 \
+  );                                                                           \
+} while (0)
+/* Runs CLIP_8B_TO_DST on SIZE consecutive rows. top points at the row above
+ * the initial DST and stays fixed for all rows; top_1 is the byte top[-1]
+ * duplicated in both halfwords and is picked up by name inside
+ * CLIP_8B_TO_DST. DST is advanced by BPS after each row, so the argument must
+ * be a modifiable pointer and ends up SIZE rows further down.
+ */
+#define CLIP_TO_DST(DST, SIZE) do {                                            \
+  int y;                                                                       \
+  const uint8_t* top = (DST) - BPS;                                            \
+  const int top_1 = ((int)top[-1] << 16) + top[-1];                            \
+  for (y = 0; y < (SIZE); ++y) {                                               \
+    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
+    (DST) += BPS;                                                              \
+  }                                                                            \
+} while (0)
+/* Defines static void TrueMotion<SIZE>(uint8_t* DST) as a thin wrapper around
+ * CLIP_TO_DST, then instantiates it for SIZE 4, 8 and 16.
+ */
+#define TRUE_MOTION(DST, SIZE)                                                 \
+static void TrueMotion##SIZE(uint8_t* (DST)) {                                 \
+  CLIP_TO_DST((DST), (SIZE));                                                  \
+}
+
+TRUE_MOTION(dst, 4)
+TRUE_MOTION(dst, 8)
+TRUE_MOTION(dst, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
+
+//------------------------------------------------------------------------------
+// Entry point
+// Installs the MIPS DSP R2 routines of this file into the VP8 function slots.
+extern void VP8DspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
+  VP8TransformDC = TransformDC;
+  VP8TransformAC3 = TransformAC3;
+  VP8Transform = TransformTwo;
+  // Filter entry points.
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+  // VP8PredLuma4 table: only slots 0, 1, 2, 4 and 6 are overridden here.
+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TrueMotion4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[6] = LD4;
+  // VP8PredChroma8 table: only slots 0, 1, 4 and 5 are overridden here.
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TrueMotion8;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  // VP8PredLuma16 table: only slot 1 is overridden here.
+  VP8PredLuma16[1] = TrueMotion16;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/dec_msa.c
+++ b/src/dsp/dec_msa.c
@ -0,0 +1,172 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of dsp functions
+//
+// Author(s):  Prashant Patil   (prashant.patil@imgtec.com)
+
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "./msa_macro.h"
+
+//------------------------------------------------------------------------------
+// Transforms
+/* IDCT_1D_W: one 1-D pass of the 4-point inverse transform on four v4i32
+ * vectors. With k1 = 20091 (cospi8sqrt2minus1) and k2 = 35468 (sinpi8sqrt2),
+ * both being 16-bit fixed-point factors:
+ *   a = in0 + in2
+ *   b = in0 - in2
+ *   c = ((in1 * k2) >> 16) - (in3 + ((in3 * k1) >> 16))
+ *   d = (in1 + ((in1 * k1) >> 16)) + ((in3 * k2) >> 16)
+ * BUTTERFLY_4 comes from msa_macro.h (not shown here) and combines a, b, c, d
+ * into out0..out3.
+ */
+#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v4i32 a1_m, b1_m, c1_m, d1_m;                                  \
+  v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
+  const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091);           \
+  const v4i32 sinpi8sqrt2 = __msa_fill_w(35468);                 \
+                                                                 \
+  a1_m = in0 + in2;                                              \
+  b1_m = in0 - in2;                                              \
+  c_tmp1_m = (in1 * sinpi8sqrt2) >> 16;                          \
+  c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16);            \
+  c1_m = c_tmp1_m - c_tmp2_m;                                    \
+  d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16);            \
+  d_tmp2_m = (in3 * sinpi8sqrt2) >> 16;                          \
+  d1_m = d_tmp1_m + d_tmp2_m;                                    \
+  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
+}
+#define MULT1(a) ((((a) * 20091) >> 16) + (a))  // scalar a + a * 20091 / 65536
+#define MULT2(a) (((a) * 35468) >> 16)          // scalar a * 35468 / 65536
+/* Inverse 4x4 transform of the 16 coefficients at in, added to the 4x4 pixel
+ * block at dst (row stride BPS).
+ * Steps: load the coefficients as two vectors and widen them to 32 bits, run
+ * IDCT_1D_W, transpose, run IDCT_1D_W again, apply SRARI_W4_SW with a shift
+ * of 3, transpose back, add the result to the widened dst pixels, clip with
+ * CLIP_SW4_0_255, pack to bytes and store with ST4x4_UB.
+ * NOTE(review): the load, unpack, transpose, shift, interleave, pack and
+ * store helpers come from msa_macro.h, which is not part of this excerpt; the
+ * step descriptions follow their names and should be checked against it.
+ */
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+  v8i16 input0, input1;
+  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+  v4i32 res0, res1, res2, res3;
+  const v16i8 zero = { 0 };
+  v16i8 dest0, dest1, dest2, dest3;
+
+  LD_SH2(in, 8, input0, input1);
+  UNPCK_SH_SW(input0, in0, in1);
+  UNPCK_SH_SW(input1, in2, in3);
+  IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+  TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+  IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+  SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+  TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+  LD_SB4(dst, BPS, dest0, dest1, dest2, dest3);
+  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+             res0, res1, res2, res3);
+  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+             res0, res1, res2, res3);
+  ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+  CLIP_SW4_0_255(res0, res1, res2, res3);
+  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+// Runs TransformOne() on the block at 'in'/'dst' and, when 'do_two' is
+// non-zero, on the following block as well (16 coefficients further in 'in',
+// 4 bytes further in 'dst').
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  const int num_blocks = do_two ? 2 : 1;
+  int n;
+  for (n = 0; n < num_blocks; ++n) {
+    TransformOne(in + 16 * n, dst + 4 * n);
+  }
+}
+/* WHT transform (per the function name) of the 16 values at in. The 16
+ * results are written to out[0], out[16], ..., out[240], i.e. one value every
+ * 16 entries.
+ * Both passes are built from vector add / subtract pairs whose operands are
+ * rearranged with VSHF_H2_SH and the four constant masks; the final values
+ * are (x + 3) >> 3 (ADDVI_H2_SH with 3, then SRAI_H2_SH by 3).
+ * NOTE(review): LD_SH2, SLDI_SH, VSHF_H2_SH, ADDVI_H2_SH and SRAI_H2_SH come
+ * from msa_macro.h, which is not part of this excerpt.
+ */
+static void TransformWHT(const int16_t* in, int16_t* out) {
+  v8i16 input0, input1;
+  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+  v8i16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0, out1;
+
+  LD_SH2(in, 8, input0, input1);
+  input1 = SLDI_SH(input1, input1, 8);
+  tmp0 = input0 + input1;
+  tmp1 = input0 - input1;
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  out0 = tmp2 + tmp3;
+  out1 = tmp2 - tmp3;
+  VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1);
+  tmp0 = input0 + input1;
+  tmp1 = input0 - input1;
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  tmp0 = tmp2 + tmp3;
+  tmp1 = tmp2 - tmp3;
+  ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1);
+  SRAI_H2_SH(out0, out1, 3);
+  out[0] = __msa_copy_s_h(out0, 0);
+  out[16] = __msa_copy_s_h(out0, 4);
+  out[32] = __msa_copy_s_h(out1, 0);
+  out[48] = __msa_copy_s_h(out1, 4);
+  out[64] = __msa_copy_s_h(out0, 1);
+  out[80] = __msa_copy_s_h(out0, 5);
+  out[96] = __msa_copy_s_h(out1, 1);
+  out[112] = __msa_copy_s_h(out1, 5);
+  out[128] = __msa_copy_s_h(out0, 2);
+  out[144] = __msa_copy_s_h(out0, 6);
+  out[160] = __msa_copy_s_h(out1, 2);
+  out[176] = __msa_copy_s_h(out1, 6);
+  out[192] = __msa_copy_s_h(out0, 3);
+  out[208] = __msa_copy_s_h(out0, 7);
+  out[224] = __msa_copy_s_h(out1, 3);
+  out[240] = __msa_copy_s_h(out1, 7);
+}
+
+// DC-only inverse transform: the single value (in[0] + 4) >> 3 is splatted
+// into a vector and handed to ADDBLK_ST4x4_UB for all four rows of 'dst'.
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+  const v8i16 dc_vec = __msa_fill_h((in[0] + 4) >> 3);
+  ADDBLK_ST4x4_UB(dc_vec, dc_vec, dc_vec, dc_vec, dst, BPS);
+}
+/* Reduced inverse transform that only reads in[0], in[1] and in[4].
+ * Each of the four vectors out0..out3 starts as a splat of a + d4, a + c4,
+ * a - c4 and a - d4 (a = in[0] + 4, c4 / d4 derived from in[4] with MULT2 /
+ * MULT1). tmp0, filled by INSERT_W4_SW from in3, in2, -in2, -in3 (derived
+ * from in[1]), is added to all four, the sums are shifted right by 3, added
+ * to the widened dst pixels, clipped with CLIP_SW4_0_255, packed to bytes and
+ * stored with ST4x4_UB.
+ * NOTE(review): the INSERT, SRAI, load, interleave, pack and store helpers
+ * come from msa_macro.h, which is not part of this excerpt.
+ */
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  const int a = in[0] + 4;
+  const int c4 = MULT2(in[4]);
+  const int d4 = MULT1(in[4]);
+  const int in2 = MULT2(in[1]);
+  const int in3 = MULT1(in[1]);
+  v4i32 tmp0 = { 0 };
+  v4i32 out0 = __msa_fill_w(a + d4);
+  v4i32 out1 = __msa_fill_w(a + c4);
+  v4i32 out2 = __msa_fill_w(a - c4);
+  v4i32 out3 = __msa_fill_w(a - d4);
+  v4i32 res0, res1, res2, res3;
+  const v4i32 zero = { 0 };
+  v16u8 dest0, dest1, dest2, dest3;
+
+  INSERT_W4_SW(in3, in2, -in2, -in3, tmp0);
+  ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0,
+       out0, out1, out2, out3);
+  SRAI_W4_SW(out0, out1, out2, out3, 3);
+  LD_UB4(dst, BPS, dest0, dest1, dest2, dest3);
+  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+             res0, res1, res2, res3);
+  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+             res0, res1, res2, res3);
+  ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3);
+  CLIP_SW4_0_255(res0, res1, res2, res3);
+  PCKEV_B2_SW(res0, res1, res2, res3, out0, out1);
+  res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1);
+  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMSA(void);
+
+// Points the four inverse-transform slots at the MSA routines of this file.
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
+  VP8Transform = TransformTwo;
+  VP8TransformDC = TransformDC;
+  VP8TransformAC3 = TransformAC3;
+  VP8TransformWHT = TransformWHT;
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8DspInitMSA)
+
+#endif  // WEBP_USE_MSA
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@ -24,24 +24,24 @@

 // Load/Store vertical edge
 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
-  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
+  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

 #define STORE8x2(c1, c2, p, stride)                                            \
-  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
+  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

 #if !defined(WORK_AROUND_GCC)

@ -389,9 +389,9 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,

 #endif  // !WORK_AROUND_GCC

-// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
-  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
+// Zero extend 'v' to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
+  return vreinterpretq_s16_u16(vmovl_u8(v));
 }

 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
@ -423,8 +423,8 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,

  {
    // Convert to 16b.
-    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
-    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+    const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
+    const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
@ -479,6 +479,21 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {

 //------------------------------------------------------------------------------

+// Applies the two-tap adjustment on signed samples, without any sign flip:
+//   *op0 = sat(p0s + (sat(delta + 3) >> 3))
+//   *oq0 = sat(q0s - (sat(delta + 4) >> 3))
+// Inputs are taken by value, so 'op0'/'oq0' may point at the caller's copies
+// of 'p0s'/'q0s'.
+static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
+                               const int8x16_t delta,
+                               int8x16_t* const op0, int8x16_t* const oq0) {
+  const int8x16_t p_adjust = vshrq_n_s8(vqaddq_s8(delta, vdupq_n_s8(0x03)), 3);
+  const int8x16_t q_adjust = vshrq_n_s8(vqaddq_s8(delta, vdupq_n_s8(0x04)), 3);
+  *op0 = vqaddq_s8(p0s, p_adjust);
+  *oq0 = vqsubq_s8(q0s, q_adjust);
+}
+
+#if defined(WEBP_USE_INTRINSICS)
+
 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
                         const int8x16_t delta,
                         uint8x16_t* const op0, uint8x16_t* const oq0) {
@ -494,8 +509,6 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
  *oq0 = FlipSignBack(sq0);
 }

-#if defined(USE_INTRINSICS)
-
 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
                      const uint8x16_t q0, const uint8x16_t q1,
                      const uint8x16_t mask,
@ -626,7 +639,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  );
 }

-#endif    // USE_INTRINSICS
+#endif    // WEBP_USE_INTRINSICS

 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  uint32_t k;
@ -721,11 +734,7 @@ static void DoFilter4(
    const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
    const int8x16_t simple_lf_delta =
        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
-    uint8x16_t tmp_p0, tmp_q0;
-    ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
-    // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
-    p0s = FlipSign(tmp_p0);
-    q0s = FlipSign(tmp_q0);
+    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter4 part (complex loopfilter on pixels without hev)
@ -797,11 +806,7 @@ static void DoFilter6(
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
-    uint8x16_t tmp_p0, tmp_q0;
-    ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
-    // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
-    p0s = FlipSign(tmp_p0);
-    q0s = FlipSign(tmp_q0);
+    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
@ -986,7 +991,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 static const int16_t kC1 = 20091;
 static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.

-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)
 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
                                     int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
@ -1163,7 +1168,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  );
 }

-#endif    // USE_INTRINSICS
+#endif    // WEBP_USE_INTRINSICS

 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
@ -1241,7 +1246,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1_full = 20091 + (1 << 16);
  static const int kC2_full = 35468;
-  const int16x4_t A = vdup_n_s16(in[0]);
+  const int16x4_t A = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
@ -1258,15 +1263,330 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 }
 #undef MUL

-#endif   // WEBP_USE_NEON
+//------------------------------------------------------------------------------
+// 4x4
+/* 4x4 DC prediction: every pixel of the block becomes
+ * (sum of 4 top pixels + sum of 4 left pixels + 4) >> 3.
+ * Each vld1_u8 fetches 8 bytes, but only lane 0 of the final sum is used: it
+ * holds top[0..3] (after the two pairwise additions) plus the four pixels at
+ * dst[k * BPS - 1] for k = 0..3. The other lanes carry partial sums that are
+ * discarded by vdup_lane_u8(dc0, 0).
+ */
+static void DC4(uint8_t* dst) {    // DC
+  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
+  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+  const uint16x4_t p1 = vpadd_u16(p0, p0);
+  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+  const uint16x8_t s0 = vaddq_u16(L0, L1);
+  const uint16x8_t s1 = vaddq_u16(L2, L3);
+  const uint16x8_t s01 = vaddq_u16(s0, s1);
+  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
+  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
+  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+  int i;
+  for (i = 0; i < 4; ++i) {
+    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
+  }
+}
+
+/* TrueMotion (4x4 + 8x8): each output pixel is the value of
+ * left[row] + top[col] - top_left, saturated to 8 bits.
+ * The difference top[col] - top_left is computed once in 16 bits (d); rows
+ * are then produced four at a time. With size equal to 4 only the first four
+ * bytes of each row are stored (lane 0 of the 32-bit view), otherwise all
+ * eight bytes are stored. size is expected to be a multiple of 4.
+ */
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
+  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
+  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
+  int y;
+  for (y = 0; y < size; y += 4) {
+    // left edge
+    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
+    const int16x8_t r1 = vaddq_s16(L1, d);
+    const int16x8_t r2 = vaddq_s16(L2, d);
+    const int16x8_t r3 = vaddq_s16(L3, d);
+    // Saturate and store the result.
+    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
+    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
+    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
+    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
+    if (size == 4) {
+      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
+    } else {
+      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
+      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
+      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
+      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
+    }
+    dst += 4 * BPS;
+  }
+}
+// 4x4 TrueMotion predictor.
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+/* 4x4 vertical prediction with a 3-tap smoothing of the top row.
+ * Eight bytes are loaded starting at the top-left neighbour. The 64-bit
+ * shifts by 8 and 16 bits produce copies in which, per the variable names,
+ * each lane holds its first and second successor (zeros shifted in at the
+ * end). vhadd_u8 followed by vrhadd_u8 then yields, per lane,
+ * (((a + c) >> 1) + b + 1) >> 1, which equals (a + 2 * b + c + 2) >> 2.
+ * Lanes 0..3 of the result are written to each of the four rows.
+ */
+static void VE4(uint8_t* dst) {    // vertical
+  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);
+  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
+  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
+  int i;
+  for (i = 0; i < 4; ++i) {
+    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
+  }
+}
+/* 4x4 down-right prediction.
+ * X is the top-left neighbour, A..D the pixels above and I..L the pixels to
+ * the left (I on row 0, L on row 3). The edge is assembled into one vector
+ * whose lanes, per the variable names, read L K J I X A B C; two shifted
+ * copies (with D patched into lane 6 of the second one) supply the
+ * neighbouring taps. vhadd_u8 followed by vrhadd_u8 applies the
+ * (a + 2 * b + c + 2) >> 2 filter per lane. Row 3 takes lanes 0..3 of the
+ * filtered vector, and each row above it takes the window one lane further
+ * (shifts by 8, 16 and 24 bits).
+ */
+static void RD4(uint8_t* dst) {   // Down-right
+  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
+  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+  const uint32_t I = dst[-1 + 0 * BPS];
+  const uint32_t J = dst[-1 + 1 * BPS];
+  const uint32_t K = dst[-1 + 2 * BPS];
+  const uint32_t L = dst[-1 + 3 * BPS];
+  const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+/* 4x4 down-left prediction from the eight pixels A..H above the block.
+ * Three overlapping 8-byte loads at offsets 0, 1 and 2 supply the taps (so
+ * bytes up to dst[-BPS + 9] are read); lane 6 of the third load is replaced
+ * by H so that the last used tap repeats H. vhadd_u8 followed by vrhadd_u8
+ * applies the (a + 2 * b + c + 2) >> 2 filter per lane. Row k takes lanes
+ * k..k+3 of the filtered vector (shifts by 0, 8, 16 and 24 bits).
+ */
+static void LD4(uint8_t* dst) {    // Down-left
+  // Note using the same shift trick as VE4() is slower here.
+  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
+  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
+  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
+  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+// 8x8 vertical prediction: the 8 pixels above the block are copied into each
+// of its 8 rows.
+static void VE8uv(uint8_t* dst) {    // vertical
+  const uint8x8_t above = vld1_u8(dst - BPS);
+  uint8_t* row = dst;
+  uint8_t* const end = dst + 8 * BPS;
+  while (row != end) {
+    vst1_u8(row, above);
+    row += BPS;
+  }
+}
+
+// 8x8 horizontal prediction: each row is filled with the pixel located
+// immediately to its left.
+static void HE8uv(uint8_t* dst) {    // horizontal
+  int row;
+  for (row = 0; row < 8; ++row) {
+    uint8_t* const out = dst + row * BPS;
+    vst1_u8(out, vld1_dup_u8(out - 1));
+  }
+}
+/* 8x8 DC prediction with selectable borders. do_top / do_left tell whether
+ * the row above / the column to the left contribute to the average:
+ *   both:      (top sum + left sum + 8) >> 4
+ *   one only:  (that sum + 4) >> 3
+ *   neither:   constant 0x80
+ * sum_top and sum_left are only read in the branches where they were set.
+ * The left loads fetch 8 bytes per row but only lane 0 (the pixel at
+ * dst[k * BPS - 1]) reaches the result, since lane 0 is broadcast at the end.
+ */
+static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+  // Sum of the 8 top pixels, replicated in every lane of sum_top.
+  if (do_top) {
+    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_top = vcombine_u16(p2, p2);
+  }
+  // Sum of the 8 left pixels, available in lane 0 of sum_left.
+  if (do_left) {
+    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
+    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
+    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
+    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
+    const uint16x8_t s0 = vaddq_u16(L0, L1);
+    const uint16x8_t s1 = vaddq_u16(L2, L3);
+    const uint16x8_t s2 = vaddq_u16(L4, L5);
+    const uint16x8_t s3 = vaddq_u16(L6, L7);
+    const uint16x8_t s01 = vaddq_u16(s0, s1);
+    const uint16x8_t s23 = vaddq_u16(s2, s3);
+    sum_left = vaddq_u16(s01, s23);
+  }
+  // Rounded average of the available borders (vrshrn rounds before narrowing).
+  if (do_top && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 4);
+  } else if (do_top) {
+    dc0 = vrshrn_n_u16(sum_top, 3);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 3);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+  // Broadcast lane 0 and write 8 bytes to each of the 8 rows.
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 8; ++i) {
+      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
+    }
+  }
+}
+
+// 8x8 chroma DC predictors: thin wrappers choosing which borders DC8() uses.
+static void DC8uv(uint8_t* dst) {
+  DC8(dst, /*do_top=*/1, /*do_left=*/1);
+}
+static void DC8uvNoTop(uint8_t* dst) {
+  DC8(dst, /*do_top=*/0, /*do_left=*/1);
+}
+static void DC8uvNoLeft(uint8_t* dst) {
+  DC8(dst, /*do_top=*/1, /*do_left=*/0);
+}
+static void DC8uvNoTopLeft(uint8_t* dst) {
+  DC8(dst, /*do_top=*/0, /*do_left=*/0);
+}
+
+// 8x8 chroma TrueMotion predictor.
+static void TM8uv(uint8_t* dst) {
+  TrueMotion(dst, 8);
+}
+
+//------------------------------------------------------------------------------
+// 16x16
+
+// 16x16 vertical prediction: the 16 pixels above the block are copied into
+// each of its 16 rows.
+static void VE16(uint8_t* dst) {     // vertical
+  const uint8x16_t above = vld1q_u8(dst - BPS);
+  int row = 0;
+  while (row < 16) {
+    vst1q_u8(dst + row * BPS, above);
+    ++row;
+  }
+}
+
+// Horizontal 16x16 prediction: every row is filled with the pixel located
+// immediately to the left of that row.
+static void HE16(uint8_t* dst) {
+  int y;
+  for (y = 0; y < 16; ++y) {
+    uint8_t* const row = dst + y * BPS;
+    // Load row[-1] replicated across all 16 lanes, then write the row.
+    vst1q_u8(row, vld1q_dup_u8(row - 1));
+  }
+}
+
+// Fills the 16x16 block at 'dst' with a single DC value.
+// 'do_top' / 'do_left' select whether the row above (dst - BPS) and the
+// column to the left (dst[-1 + k * BPS]) contribute to the average:
+//   both    -> rounded average of the 32 border samples (shift by 5)
+//   one     -> rounded average of its 16 samples        (shift by 4)
+//   neither -> constant 0x80
+// 'sum_top' / 'sum_left' are only assigned, and only read, when the matching
+// flag is set.
+static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_top) {
+    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
+    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    const uint16x4_t p3 = vpadd_u16(p2, p2);
+    // After the two pairwise adds every lane of p3 holds the full top sum.
+    sum_top = vcombine_u16(p3, p3);
+  }
+
+  if (do_left) {
+    int i;
+    sum_left = vdupq_n_u16(0);
+    // Two passes of 8 rows each. Every load reads 8 bytes starting at the
+    // left-neighbour pixel of a row, so only lane 0 accumulates left-column
+    // samples; the other lanes hold values that are never used, because only
+    // lane 0 of 'dc0' is broadcast at the end.
+    for (i = 0; i < 16; i += 8) {
+      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
+      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
+      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
+      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
+      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
+      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
+      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
+      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
+      // Tree reduction of the 8 widened rows.
+      const uint16x8_t s0 = vaddq_u16(L0, L1);
+      const uint16x8_t s1 = vaddq_u16(L2, L3);
+      const uint16x8_t s2 = vaddq_u16(L4, L5);
+      const uint16x8_t s3 = vaddq_u16(L6, L7);
+      const uint16x8_t s01 = vaddq_u16(s0, s1);
+      const uint16x8_t s23 = vaddq_u16(s2, s3);
+      const uint16x8_t sum = vaddq_u16(s01, s23);
+      sum_left = vaddq_u16(sum_left, sum);
+    }
+  }
+
+  // vrshrn_n_u16 is a rounding right shift with narrowing to 8 bits, so the
+  // rounding bias is applied by the instruction itself.
+  if (do_top && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 5);
+  } else if (do_top) {
+    dc0 = vrshrn_n_u16(sum_top, 4);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 4);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    // Broadcast lane 0 (the only lane guaranteed meaningful in every branch)
+    // and write it to all 16 rows.
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 16; ++i) {
+      vst1q_u8(dst + i * BPS, dc);
+    }
+  }
+}
+
+// 16x16 luma DC entry points, one per combination of usable borders.
+static void DC16TopLeft(uint8_t* dst) {
+  DC16(dst, /*do_top=*/1, /*do_left=*/1);
+}
+static void DC16NoTop(uint8_t* dst) {
+  DC16(dst, /*do_top=*/0, /*do_left=*/1);
+}
+static void DC16NoLeft(uint8_t* dst) {
+  DC16(dst, /*do_top=*/1, /*do_left=*/0);
+}
+static void DC16NoTopLeft(uint8_t* dst) {
+  DC16(dst, /*do_top=*/0, /*do_left=*/0);
+}
+
+// TrueMotion 16x16 prediction: for each row r and column c the output is
+// L[r] + A[c] - A[-1], saturated to 8 bits, where A is the row above the
+// block (A[-1] being the top-left corner) and L the column to its left.
+// The per-column term A[c] - A[-1] is computed once; rows are then produced
+// four at a time.
+static void TM16(uint8_t* dst) {
+  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
+  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
+  // A[c] - A[-1]
+  // (widening subtract; the u16 result is reinterpreted as signed so that
+  // negative differences are represented correctly)
+  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
+  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
+  int y;
+  for (y = 0; y < 16; y += 4) {
+    // left edge
+    // Each left pixel is loaded replicated across the vector.
+    // NOTE(review): ConvertU8ToS16 is defined outside this view; presumably
+    // it widens the 8 unsigned bytes to signed 16-bit lanes -- confirm.
+    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
+    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
+    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
+    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
+    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
+    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
+    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
+    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
+    // Saturate and store the result.
+    // (vqmovun_s16 narrows signed 16-bit to unsigned 8-bit with saturation)
+    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
+    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
+    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
+    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
+    vst1q_u8(dst + 0 * BPS, row0);
+    vst1q_u8(dst + 1 * BPS, row1);
+    vst1q_u8(dst + 2 * BPS, row2);
+    vst1q_u8(dst + 3 * BPS, row3);
+    // Advance to the next group of four rows.
+    dst += 4 * BPS;
+  }
+}

 //------------------------------------------------------------------------------
 // Entry point

 extern void VP8DspInitNEON(void);

-void VP8DspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo;
  VP8TransformAC3 = TransformAC3;
  VP8TransformDC = TransformDC;
@ -1288,5 +1608,32 @@ void VP8DspInitNEON(void) {
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif   // WEBP_USE_NEON
+
+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[6] = LD4;
+
+  VP8PredLuma16[0] = DC16TopLeft;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
 }
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8DspInitNEON)
+
+#endif  // WEBP_USE_NEON
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -21,7 +21,9 @@
 // #define USE_TRANSFORM_AC3

 #include <emmintrin.h>
+#include "./common_sse2.h"
 #include "../dec/vp8i.h"
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@ -52,19 +54,19 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
  // vectors will just contain random value we'll never use nor store.
  __m128i in0, in1, in2, in3;
  {
-    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
-    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
-    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
-    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
+    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
+    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
+    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
-      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
-      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
-      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
-      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
@ -102,34 +104,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
-    // a00 a01 a02 a03   b00 b01 b02 b03
-    // a10 a11 a12 a13   b10 b11 b12 b13
-    // a20 a21 a22 a23   b20 b21 b22 b23
-    // a30 a31 a32 a33   b30 b31 b32 b33
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
-    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
-    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
-    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
-    // a00 a10 a01 a11   a02 a12 a03 a13
-    // a20 a30 a21 a31   a22 a32 a23 a33
-    // b00 b10 b01 b11   b02 b12 b03 b13
-    // b20 b30 b21 b31   b22 b32 b23 b33
-    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
-    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
-    // a00 a10 a20 a30 a01 a11 a21 a31
-    // b00 b10 b20 b30 b01 b11 b21 b31
-    // a02 a12 a22 a32 a03 a13 a23 a33
-    // b02 b12 a22 b32 b03 b13 b23 b33
-    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
-    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
-    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
-    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
-    // a00 a10 a20 a30   b00 b10 b20 b30
-    // a01 a11 a21 a31   b01 b11 b21 b31
-    // a02 a12 a22 a32   b02 b12 b22 b32
-    // a03 a13 a23 a33   b03 b13 b23 b33
+    VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);
  }

  // Horizontal pass and subsequent transpose.
@ -164,34 +139,8 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
-    // a00 a01 a02 a03   b00 b01 b02 b03
-    // a10 a11 a12 a13   b10 b11 b12 b13
-    // a20 a21 a22 a23   b20 b21 b22 b23
-    // a30 a31 a32 a33   b30 b31 b32 b33
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
-    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
-    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
-    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
-    // a00 a10 a01 a11   a02 a12 a03 a13
-    // a20 a30 a21 a31   a22 a32 a23 a33
-    // b00 b10 b01 b11   b02 b12 b03 b13
-    // b20 b30 b21 b31   b22 b32 b23 b33
-    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
-    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
-    // a00 a10 a20 a30 a01 a11 a21 a31
-    // b00 b10 b20 b30 b01 b11 b21 b31
-    // a02 a12 a22 a32 a03 a13 a23 a33
-    // b02 b12 a22 b32 b03 b13 b23 b33
-    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
-    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
-    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
-    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
-    // a00 a10 a20 a30   b00 b10 b20 b30
-    // a01 a11 a21 a31   b01 b11 b21 b31
-    // a02 a12 a22 a32   b02 b12 b22 b32
-    // a03 a13 a23 a33   b03 b13 b23 b33
+    VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
+                        &T2, &T3);
  }

  // Add inverse transform to 'dst' and store.
@ -207,10 +156,10 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
    } else {
      // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
-      dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
-      dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
-      dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+      dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
+      dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
+      dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
+      dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
@ -236,10 +185,10 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
    } else {
      // Store four bytes/pixels per line.
-      *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
-      *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
-      *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
-      *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+      WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+      WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+      WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+      WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
    }
  }
 }
@ -262,10 +211,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
-  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
-  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
-  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
-  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+  __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
+  __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
+  __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
+  __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
@ -282,10 +231,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
-  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
-  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
-  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
-  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
 }
 #undef MUL
 #endif   // USE_TRANSFORM_AC3
@ -301,11 +250,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 // Shift each byte of "x" by 3 bits while preserving by the sign bit.
 static WEBP_INLINE void SignedShift8b(__m128i* const x) {
  const __m128i zero = _mm_setzero_si128();
-  const __m128i signs = _mm_cmpgt_epi8(zero, *x);
-  const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs);  // s8 -> s16 sign extend
-  const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
-  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
-  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
+  const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
+  const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
+  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3 + 8);
+  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3 + 8);
  *x = _mm_packs_epi16(lo_1, hi_1);
 }

@ -330,11 +278,10 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
  const __m128i t_2 = MM_ABS(*q1, *q0);

  const __m128i h = _mm_set1_epi8(hev_thresh);
-  const __m128i t_3 = _mm_subs_epu8(t_1, h);  // abs(p1 - p0) - hev_tresh
-  const __m128i t_4 = _mm_subs_epu8(t_2, h);  // abs(q1 - q0) - hev_tresh
+  const __m128i t_max = _mm_max_epu8(t_1, t_2);

-  *not_hev = _mm_or_si128(t_3, t_4);
-  *not_hev = _mm_cmpeq_epi8(*not_hev, zero);  // not_hev <= t1 && not_hev <= t2
+  const __m128i t_max_h = _mm_subs_epu8(t_max, h);
+  *not_hev = _mm_cmpeq_epi8(t_max_h, zero);  // not_hev <= t1 && not_hev <= t2
 }

 // input pixels are int8_t
@ -428,9 +375,11 @@ static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
 static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  const __m128i* const mask, int hev_thresh) {
-  const __m128i sign_bit = _mm_set1_epi8(0x80);
-  const __m128i k64 = _mm_set1_epi8(0x40);
  const __m128i zero = _mm_setzero_si128();
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  const __m128i k64 = _mm_set1_epi8(64);
+  const __m128i k3 = _mm_set1_epi8(3);
+  const __m128i k4 = _mm_set1_epi8(4);
  __m128i not_hev;
  __m128i t1, t2, t3;

@ -448,10 +397,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

-  t2 = _mm_set1_epi8(3);
-  t3 = _mm_set1_epi8(4);
-  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 3
-  t3 = _mm_adds_epi8(t1, t3);        // 3 * (q0 - p0) + (p1 - q1) + 4
+  t2 = _mm_adds_epi8(t1, k3);        // 3 * (q0 - p0) + hev(p1 - q1) + 3
+  t3 = _mm_adds_epi8(t1, k4);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
  SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
  SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
@ -520,47 +467,31 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
 }

 // reads 8 rows across a vertical edge.
-//
-// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
-// two Load4x4() to avoid code duplication.
 static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
                                __m128i* const p, __m128i* const q) {
-  __m128i t1, t2;
+  // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
+  // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
+  const __m128i A0 = _mm_set_epi32(
+      WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]),
+      WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride]));
+  const __m128i A1 = _mm_set_epi32(
+      WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]),
+      WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride]));

-  // Load 0th, 1st, 4th and 5th rows
-  __m128i r0 =  _mm_cvtsi32_si128(*((int*)&b[0 * stride]));  // 03 02 01 00
-  __m128i r1 =  _mm_cvtsi32_si128(*((int*)&b[1 * stride]));  // 13 12 11 10
-  __m128i r4 =  _mm_cvtsi32_si128(*((int*)&b[4 * stride]));  // 43 42 41 40
-  __m128i r5 =  _mm_cvtsi32_si128(*((int*)&b[5 * stride]));  // 53 52 51 50
+  // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+  // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+  const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
+  const __m128i B1 = _mm_unpackhi_epi8(A0, A1);

-  r0 = _mm_unpacklo_epi32(r0, r4);               // 43 42 41 40 03 02 01 00
-  r1 = _mm_unpacklo_epi32(r1, r5);               // 53 52 51 50 13 12 11 10
-
-  // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
-  t1 = _mm_unpacklo_epi8(r0, r1);
-
-  // Load 2nd, 3rd, 6th and 7th rows
-  r0 =  _mm_cvtsi32_si128(*((int*)&b[2 * stride]));          // 23 22 21 22
-  r1 =  _mm_cvtsi32_si128(*((int*)&b[3 * stride]));          // 33 32 31 30
-  r4 =  _mm_cvtsi32_si128(*((int*)&b[6 * stride]));          // 63 62 61 60
-  r5 =  _mm_cvtsi32_si128(*((int*)&b[7 * stride]));          // 73 72 71 70
-
-  r0 = _mm_unpacklo_epi32(r0, r4);               // 63 62 61 60 23 22 21 20
-  r1 = _mm_unpacklo_epi32(r1, r5);               // 73 72 71 70 33 32 31 30
-
-  // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-  t2 = _mm_unpacklo_epi8(r0, r1);
-
-  // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-  // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-  r0 = t1;
-  t1 = _mm_unpacklo_epi16(t1, t2);
-  t2 = _mm_unpackhi_epi16(r0, t2);
+  // C0 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+  // C1 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+  const __m128i C0 = _mm_unpacklo_epi16(B0, B1);
+  const __m128i C1 = _mm_unpackhi_epi16(B0, B1);

  // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-  *p = _mm_unpacklo_epi32(t1, t2);
-  *q = _mm_unpackhi_epi32(t1, t2);
+  *p = _mm_unpacklo_epi32(C0, C1);
+  *q = _mm_unpackhi_epi32(C0, C1);
 }

 static WEBP_INLINE void Load16x4(const uint8_t* const r0,
@ -568,7 +499,6 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
                                 int stride,
                                 __m128i* const p1, __m128i* const p0,
                                 __m128i* const q0, __m128i* const q1) {
-  __m128i t1, t2;
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
  //                10 11 | 12 13
@ -587,22 +517,24 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);

-  t1 = *p1;
-  t2 = *q0;
-  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-  *p1 = _mm_unpacklo_epi64(t1, *p0);
-  *p0 = _mm_unpackhi_epi64(t1, *p0);
-  *q0 = _mm_unpacklo_epi64(t2, *q1);
-  *q1 = _mm_unpackhi_epi64(t2, *q1);
+  {
+    // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+    // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+    // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+    // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+    const __m128i t1 = *p1;
+    const __m128i t2 = *q0;
+    *p1 = _mm_unpacklo_epi64(t1, *p0);
+    *p0 = _mm_unpackhi_epi64(t1, *p0);
+    *q0 = _mm_unpacklo_epi64(t2, *q1);
+    *q1 = _mm_unpackhi_epi64(t2, *q1);
+  }
 }

 static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
-    *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
+    WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
    *x = _mm_srli_si128(*x, 4);
  }
 }
@ -947,15 +879,308 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
  Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
 }

-#endif   // WEBP_USE_SSE2
+//------------------------------------------------------------------------------
+// 4x4 predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+
+// We use the following 8b-arithmetic tricks:
+//     (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
+//   where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
+// and:
+//     (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
+//   where: AB = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
+//   and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1
+
+// Vertical 4x4 prediction. The row that gets replicated is not the raw top
+// row but its 3-tap smoothed version, i.e. for each column x:
+//   (top[x - 1] + 2 * top[x] + top[x + 1] + 2) >> 2
+// computed on bytes with the avg/lsb trick: floor((a + c) / 2) is obtained as
+// avg(a, c) - ((a ^ c) & 1), then averaged (with rounding) against b.
+static void VE4(uint8_t* dst) {    // vertical
+  const __m128i one = _mm_set1_epi8(1);
+  // 8 bytes starting at the top-left corner pixel (one left of the top row).
+  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);   // (A + C + 1) >> 1
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
+  const __m128i b = _mm_subs_epu8(a, lsb);              // (A + C) >> 1
+  const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);        // smoothed top row
+  // Only the low four bytes (one 4-pixel row) are kept.
+  const uint32_t vals = _mm_cvtsi128_si32(avg);
+  int i;
+  for (i = 0; i < 4; ++i) {
+    WebPUint32ToMem(dst + i * BPS, vals);
+  }
+}
+
+// Down-left 4x4 prediction. With t[0..7] the 8 pixels above the block
+// (A..H), the 3-tap filter f[i] = (t[i] + 2 * t[i+1] + t[i+2] + 2) >> 2 is
+// evaluated for i = 0..6, the last pixel H being repeated past the end.
+// Row r of the block then receives f[r .. r+3].
+static void LD4(uint8_t* dst) {   // Down-Left
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+  // Writing H into 16-bit lane 3 sets byte 6 to H (byte 7 stays 0), which
+  // provides the repeated last pixel.
+  const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, dst[-BPS + 7], 3);
+  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
+  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);   // floor((t[i] + t[i+2]) / 2)
+  const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
+  // Each row is the filtered vector shifted by one more byte.
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+// Vertical-right 4x4 prediction. X is the top-left corner pixel, A..D the
+// pixels above the block and I, J, K the first three pixels of the left
+// column; all of them are read before anything is written.
+//   row 0: 2-tap averages   avg(X,A) avg(A,B) avg(B,C) avg(C,D)
+//   row 1: 3-tap averages   AVG3(I,X,A) AVG3(X,A,B) AVG3(A,B,C) AVG3(B,C,D)
+//   row 2: row 0 shifted right by one pixel, first pixel = AVG3(J,I,X)
+//   row 3: row 1 shifted right by one pixel, first pixel = AVG3(K,J,I)
+static void VR4(uint8_t* dst) {   // Vertical-Right
+  const __m128i one = _mm_set1_epi8(1);
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int X = dst[-1 - BPS];
+  const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+  const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
+  const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
+  const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
+  // Bytes 0 and 1 become I and X, giving the sequence I X A B C D.
+  const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
+  const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
+  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+  const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
+  // The byte-shifted stores for rows 2 and 3 leave a zero in column 0; it is
+  // overwritten by the two scalar assignments below.
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+
+  // these two are hard to implement in SSE2, so we keep the C-version:
+  DST(0, 2) = AVG3(J, I, X);
+  DST(0, 3) = AVG3(K, J, I);
+}
+
+// Vertical-left 4x4 prediction from the 8 pixels t[0..7] (A..H) above the
+// block. Two filtered vectors are built:
+//   avg1[i] = (t[i] + t[i+1] + 1) >> 1                    (2-tap)
+//   avg4[i] = (t[i] + 2 * t[i+1] + t[i+2] + 2) >> 2       (3-tap)
+// avg4 uses the "(AB + BC + 1) >> 1 - ((ab | bc) & lsb)" identity, where AB
+// and BC are the rounded pair averages and lsb = (AB ^ BC) & 1.
+// Rows 0/2 take avg1 at offsets 0/1 and rows 1/3 take avg4 at offsets 0/1;
+// the last pixel of rows 2 and 3 is then replaced by avg4[4] and avg4[5].
+static void VL4(uint8_t* dst) {   // Vertical-Left
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
+  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
+  const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);   // AB
+  const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);   // BC
+  const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
+  const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
+  const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
+  const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
+  const __m128i abbc = _mm_or_si128(ab, bc);
+  const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
+  const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
+  // Bytes 4..7 of avg4; only bytes 4 and 5 are used below.
+  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+
+  // these two are hard to get and irregular
+  DST(3, 2) = (extra_out >> 0) & 0xff;
+  DST(3, 3) = (extra_out >> 8) & 0xff;
+}
+
+// Down-right 4x4 prediction. The border pixels are laid out in one vector as
+//   v = L K J I X A B C D ...
+// (left column bottom-to-top, then the top-left corner X, then the top row)
+// and the 3-tap filter f[i] = (v[i] + 2 * v[i+1] + v[i+2] + 2) >> 2 is
+// evaluated on it. Row 3 receives f[0..3], and each row above it the same
+// vector shifted by one more byte, up to row 0 which receives f[3..6].
+static void RD4(uint8_t* dst) {   // Down-right
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+  // Make room in the low four bytes for the left column.
+  const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
+  const uint32_t I = dst[-1 + 0 * BPS];
+  const uint32_t J = dst[-1 + 1 * BPS];
+  const uint32_t K = dst[-1 + 2 * BPS];
+  const uint32_t L = dst[-1 + 3 * BPS];
+  // L lands in byte 0, I in byte 3.
+  const __m128i LKJI_____ =
+      _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24));
+  const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD);
+  const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
+  const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
+  const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
+  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);   // floor((v[i] + v[i+2]) / 2)
+  const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+#undef DST
+#undef AVG3
+
+//------------------------------------------------------------------------------
+// Luma 16x16
+
+// TrueMotion prediction for a square block: every output pixel is
+//   above[x] + left[y] - top_left
+// clamped to [0, 255] by the unsigned-saturating pack.
+// 'size' selects the 4x4 path (4), the 8x8 path (8), or the 16x16 path (any
+// other value).
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+  const uint8_t* const above = dst - BPS;
+  const __m128i kZero = _mm_setzero_si128();
+  int row;
+  if (size == 4) {
+    const __m128i above8 = _mm_cvtsi32_si128(WebPMemToUint32(above));
+    const __m128i above16 = _mm_unpacklo_epi8(above8, kZero);
+    for (row = 0; row < 4; ++row) {
+      // left pixel minus top-left pixel, replicated in every 16-bit lane
+      const __m128i delta = _mm_set1_epi16(dst[-1] - above[-1]);
+      const __m128i sum = _mm_add_epi16(delta, above16);
+      const __m128i packed = _mm_packus_epi16(sum, kZero);
+      WebPUint32ToMem(dst, _mm_cvtsi128_si32(packed));
+      dst += BPS;
+    }
+  } else if (size == 8) {
+    const __m128i above8 = _mm_loadl_epi64((const __m128i*)above);
+    const __m128i above16 = _mm_unpacklo_epi8(above8, kZero);
+    for (row = 0; row < 8; ++row) {
+      const __m128i delta = _mm_set1_epi16(dst[-1] - above[-1]);
+      const __m128i sum = _mm_add_epi16(delta, above16);
+      const __m128i packed = _mm_packus_epi16(sum, kZero);
+      _mm_storel_epi64((__m128i*)dst, packed);
+      dst += BPS;
+    }
+  } else {
+    const __m128i above8 = _mm_loadu_si128((const __m128i*)above);
+    const __m128i above16_lo = _mm_unpacklo_epi8(above8, kZero);
+    const __m128i above16_hi = _mm_unpackhi_epi8(above8, kZero);
+    for (row = 0; row < 16; ++row) {
+      const __m128i delta = _mm_set1_epi16(dst[-1] - above[-1]);
+      const __m128i sum_lo = _mm_add_epi16(delta, above16_lo);
+      const __m128i sum_hi = _mm_add_epi16(delta, above16_hi);
+      _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(sum_lo, sum_hi));
+      dst += BPS;
+    }
+  }
+}
+
+// TrueMotion entry points for the 4x4, 8x8 and 16x16 block sizes.
+static void TM4(uint8_t* dst) {
+  TrueMotion(dst, 4);
+}
+static void TM8uv(uint8_t* dst) {
+  TrueMotion(dst, 8);
+}
+static void TM16(uint8_t* dst) {
+  TrueMotion(dst, 16);
+}
+
+// Vertical 16x16 prediction: copies the 16 pixels above the block into each
+// of its 16 rows.
+static void VE16(uint8_t* dst) {
+  const __m128i above = _mm_loadu_si128((const __m128i*)(dst - BPS));
+  int offset;
+  for (offset = 0; offset < 16 * BPS; offset += BPS) {
+    _mm_storeu_si128((__m128i*)(dst + offset), above);
+  }
+}
+
+// Horizontal 16x16 prediction: each row is filled with the pixel immediately
+// to its left.
+static void HE16(uint8_t* dst) {
+  int y;
+  for (y = 0; y < 16; ++y) {
+    uint8_t* const row = dst + y * BPS;
+    _mm_storeu_si128((__m128i*)row, _mm_set1_epi8(row[-1]));
+  }
+}
+
+// Writes the byte 'v' to 16 rows of 16 pixels, rows being BPS bytes apart
+// and the first one starting at 'dst'.
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+  const __m128i fill = _mm_set1_epi8(v);
+  int offset;
+  for (offset = 0; offset < 16 * BPS; offset += BPS) {
+    _mm_storeu_si128((__m128i*)(dst + offset), fill);
+  }
+}
+
+// DC 16x16 prediction using both borders: the block is filled with
+// (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5.
+static void DC16(uint8_t* dst) {
+  const __m128i above = _mm_loadu_si128((const __m128i*)(dst - BPS));
+  // SAD against zero produces one partial sum per 8-byte half.
+  const __m128i halves = _mm_sad_epu8(above, _mm_setzero_si128());
+  // Move the upper half's sum down to lane 0 and add it to the lower one.
+  const __m128i top_sum = _mm_add_epi16(halves, _mm_shuffle_epi32(halves, 2));
+  int acc = _mm_cvtsi128_si32(top_sum) + 16;   // top sum + rounding bias
+  int y;
+  for (y = 0; y < 16; ++y) {
+    acc += dst[-1 + y * BPS];                  // left column
+  }
+  Put16(acc >> 5, dst);
+}
+
+// DC 16x16 prediction when the top row is unavailable: the block is filled
+// with (sum of 16 left pixels + 8) >> 4.
+static void DC16NoTop(uint8_t* dst) {
+  int left_sum = 0;
+  int y;
+  for (y = 0; y < 16; ++y) {
+    left_sum += dst[-1 + y * BPS];
+  }
+  Put16((left_sum + 8) >> 4, dst);
+}
+
+// DC 16x16 prediction when the left column is unavailable: the block is
+// filled with (sum of 16 top pixels + 8) >> 4.
+static void DC16NoLeft(uint8_t* dst) {
+  const __m128i above = _mm_loadu_si128((const __m128i*)(dst - BPS));
+  // One partial sum per 8-byte half, then both folded into lane 0.
+  const __m128i halves = _mm_sad_epu8(above, _mm_setzero_si128());
+  const __m128i top_sum = _mm_add_epi16(halves, _mm_shuffle_epi32(halves, 2));
+  Put16((_mm_cvtsi128_si32(top_sum) + 8) >> 4, dst);
+}
+
+// DC 16x16 prediction when neither border is available: constant 0x80 fill.
+static void DC16NoTopLeft(uint8_t* dst) {
+  const uint8_t kNoBorderDC = 0x80;
+  Put16(kNoBorderDC, dst);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+// Vertical 8x8 chroma prediction: copies the 8 pixels above the block into
+// each of its 8 rows.
+static void VE8uv(uint8_t* dst) {
+  const __m128i above = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  int offset;
+  for (offset = 0; offset < 8 * BPS; offset += BPS) {
+    _mm_storel_epi64((__m128i*)(dst + offset), above);
+  }
+}
+
+// Horizontal 8x8 chroma prediction: each row is filled with the pixel
+// immediately to its left.
+static void HE8uv(uint8_t* dst) {
+  int y;
+  for (y = 0; y < 8; ++y) {
+    uint8_t* const row = dst + y * BPS;
+    _mm_storel_epi64((__m128i*)row, _mm_set1_epi8(row[-1]));
+  }
+}
+
+// Helper for the chroma DC predictors: writes the byte 'v' to 8 rows of
+// 8 pixels, rows being BPS bytes apart and the first one starting at 'dst'.
+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+  const __m128i fill = _mm_set1_epi8(v);
+  int offset;
+  for (offset = 0; offset < 8 * BPS; offset += BPS) {
+    _mm_storel_epi64((__m128i*)(dst + offset), fill);
+  }
+}
+
+// Chroma 8x8 DC prediction using both borders: the block is filled with
+// (sum of 8 top pixels + sum of 8 left pixels + 8) >> 4.
+static void DC8uv(uint8_t* dst) {
+  const __m128i above = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  // Lane 0 of the SAD against zero is the sum of the low 8 bytes.
+  const __m128i top_sum = _mm_sad_epu8(above, _mm_setzero_si128());
+  int acc = _mm_cvtsi128_si32(top_sum) + 8;   // top sum + rounding bias
+  int y;
+  for (y = 0; y < 8; ++y) {
+    acc += dst[-1 + y * BPS];                 // left column
+  }
+  Put8x8uv(acc >> 4, dst);
+}
+
+// Chroma 8x8 DC prediction when the left column is unavailable: the block is
+// filled with (sum of 8 top pixels + 4) >> 3.
+static void DC8uvNoLeft(uint8_t* dst) {
+  const __m128i above = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  const __m128i top_sum = _mm_sad_epu8(above, _mm_setzero_si128());
+  Put8x8uv((_mm_cvtsi128_si32(top_sum) + 4) >> 3, dst);
+}
+
+// Chroma 8x8 DC prediction when the top row is unavailable: the block is
+// filled with (sum of 8 left pixels + 4) >> 3.
+static void DC8uvNoTop(uint8_t* dst) {
+  int left_sum = 0;
+  int y;
+  for (y = 0; y < 8; ++y) {
+    left_sum += dst[-1 + y * BPS];
+  }
+  Put8x8uv((left_sum + 4) >> 3, dst);
+}
+
+// Chroma 8x8 DC prediction when neither border is available: constant 0x80
+// fill.
+static void DC8uvNoTopLeft(uint8_t* dst) {
+  const uint8_t kNoBorderDC = 0x80;
+  Put8x8uv(kNoBorderDC, dst);
+}

 //------------------------------------------------------------------------------
 // Entry point

 extern void VP8DspInitSSE2(void);

-void VP8DspInitSSE2(void) {
-#if defined(WEBP_USE_SSE2)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
  VP8Transform = Transform;
 #if defined(USE_TRANSFORM_AC3)
  VP8TransformAC3 = TransformAC3;
@ -974,5 +1199,33 @@ void VP8DspInitSSE2(void) {
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif   // WEBP_USE_SSE2
+
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[5] = VR4;
+  VP8PredLuma4[6] = LD4;
+  VP8PredLuma4[7] = VL4;
+
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
 }
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8DspInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/dec_sse41.c
+++ b/src/dsp/dec_sse41.c
@ -0,0 +1,46 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 version of some decoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include <smmintrin.h>
+#include "../dec/vp8i.h"
+#include "../utils/utils.h"
+
+// Horizontal luma prediction: each of the 16 rows is filled with a single
+// byte taken from the 4 bytes preceding that row.
+static void HE16(uint8_t* dst) {
+  // Shuffle control that selects source byte #3 for all 16 output bytes.
+  // NOTE(review): assuming WebPMemToUint32() is a plain memory-order load,
+  // byte #3 of the word read at 'row - 4' is row[-1], the left neighbor.
+  const __m128i kPickByte3 = _mm_set1_epi8(3);
+  int y;
+  for (y = 0; y < 16; ++y) {
+    uint8_t* const row = dst + y * BPS;
+    const __m128i left4 = _mm_cvtsi32_si128(WebPMemToUint32(row - 4));
+    const __m128i filled = _mm_shuffle_epi8(left4, kPickByte3);
+    _mm_storeu_si128((__m128i*)row, filled);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitSSE41(void);
+
+// Installs the SSE4.1 decoding functions: replaces entry #3 of
+// VP8PredLuma16[] with the HE16() defined in this file.
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
+  VP8PredLuma16[3] = HE16;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8DspInitSSE41)
+
+#endif  // WEBP_USE_SSE41
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -24,6 +24,8 @@
 extern "C" {
 #endif

+#define BPS 32   // this is the common stride for enc/dec
+
 //------------------------------------------------------------------------------
 // CPU detection

@ -36,20 +38,20 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif

-#ifdef __clang__
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif  // __clang__
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif

 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
 #endif

+#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
+#endif
+
 // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
 // files without intrinsics, allowing the corresponding Init() to be called.
 // Files containing intrinsics will need to be built targeting the instruction
@ -58,6 +60,10 @@ extern "C" {
 #define WEBP_USE_SSE2
 #endif

+#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
+#define WEBP_USE_SSE41
+#endif
+
 #if defined(__AVX2__) || defined(WEBP_HAVE_AVX2)
 #define WEBP_USE_AVX2
 #endif
@ -69,28 +75,84 @@ extern "C" {
 // The intrinsics currently cause compiler errors with arm-nacl-gcc and the
 // inline assembly would need to be modified for use with Native Client.
 #if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
-     defined(__aarch64__)) && !defined(__native_client__)
+     defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
+    !defined(__native_client__)
 #define WEBP_USE_NEON
 #endif

-#if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6)
+#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
+#define WEBP_USE_NEON
+#define WEBP_USE_INTRINSICS
+#endif
+
+#if defined(__mips__) && !defined(__mips64) && \
+    defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
 #define WEBP_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WEBP_USE_MIPS32_R2
+#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
+#define WEBP_USE_MIPS_DSP_R2
+#endif
+#endif
+#endif
+
+#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
+#define WEBP_USE_MSA
+#endif
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#define WEBP_TSAN_IGNORE_FUNCTION
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#undef WEBP_TSAN_IGNORE_FUNCTION
+#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
+#endif
+#endif
+
+#define WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#if !defined(WEBP_FORCE_ALIGNED) && defined(__clang__) && \
+    defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures. This is only meant to silence unaligned loads on platforms that
+// are known to support them.
+#undef WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNDEF \
+  __attribute__((no_sanitize("undefined")))
+
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures related to unsigned integer overflows. This is only meant to
+// silence cases where this well defined behavior is expected.
+#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
 #endif
 #endif

 typedef enum {
  kSSE2,
  kSSE3,
+  kSSE4_1,
  kAVX,
  kAVX2,
  kNEON,
-  kMIPS32
+  kMIPS32,
+  kMIPSdspR2,
+  kMSA
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
-extern VP8CPUInfo VP8GetCPUInfo;
+WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
+
+//------------------------------------------------------------------------------
+// Init stub generator
+
+// Defines an init function stub to ensure each module exposes a symbol,
+// avoiding a compiler warning.
+#define WEBP_DSP_INIT_STUB(func) \
+  extern void func(void); \
+  WEBP_TSAN_IGNORE_FUNCTION void func(void) {}

 //------------------------------------------------------------------------------
 // Encoding
@ -104,6 +166,7 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
 typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
 extern VP8Idct VP8ITransform;
 extern VP8Fdct VP8FTransform;
+extern VP8Fdct VP8FTransform2;   // performs two transforms at a time
 extern VP8WHT VP8FTransformWHT;
 // Predictions
 // *dst is the destination block. *top and *left can be NULL.
@ -118,30 +181,98 @@ typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
 extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
 typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
                          const uint16_t* const weights);
+// The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major
+// 4 by 4 symmetric matrix.
 extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;

 typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
 extern VP8BlockCopy VP8Copy4x4;
+extern VP8BlockCopy VP8Copy16x8;
 // Quantization
 struct VP8Matrix;   // forward declaration
 typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
                                const struct VP8Matrix* const mtx);
+// Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
+typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
+                                  const struct VP8Matrix* const mtx);
+
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
+extern VP8Quantize2Blocks VP8EncQuantize2Blocks;

 // specific to 2nd transform:
 typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
                                   const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;

-// Collect histogram for susceptibility calculation and accumulate in histo[].
-struct VP8Histogram;
+extern const int VP8DspScan[16 + 4 + 4];
+
+// Collect histogram for susceptibility calculation.
+#define MAX_COEFF_THRESH   31   // size of histogram used by CollectHistogram.
+typedef struct {
+  // We only need to store max_value and last_non_zero, not the distribution.
+  int max_value;
+  int last_non_zero;
+} VP8Histogram;
 typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
                          int start_block, int end_block,
-                          struct VP8Histogram* const histo);
-extern const int VP8DspScan[16 + 4 + 4];
+                          VP8Histogram* const histo);
 extern VP8CHisto VP8CollectHistogram;
+// General-purpose util function to help VP8CollectHistogram().
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+                         VP8Histogram* const histo);

-void VP8EncDspInit(void);   // must be called before using any of the above
+// must be called before using any of the above
+void VP8EncDspInit(void);
+
+//------------------------------------------------------------------------------
+// cost functions (encoding)
+
+extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)
+// approximate cost per level:
+extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
+extern const uint8_t VP8EncBands[16 + 1];
+
+struct VP8Residual;
+typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
+                                         struct VP8Residual* const res);
+extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+// Cost calculation function.
+typedef int (*VP8GetResidualCostFunc)(int ctx0,
+                                      const struct VP8Residual* const res);
+extern VP8GetResidualCostFunc VP8GetResidualCost;
+
+// must be called before anything using the above
+void VP8EncDspCostInit(void);
+
+//------------------------------------------------------------------------------
+// SSIM utils
+
+// struct for accumulating statistical moments
+typedef struct {
+  double w;              // sum(w_i) : sum of weights
+  double xm, ym;         // sum(w_i * x_i), sum(w_i * y_i)
+  double xxm, xym, yym;  // sum(w_i * x_i * x_i), etc.
+} VP8DistoStats;
+
+#define VP8_SSIM_KERNEL 3   // total size of the kernel: 2 * VP8_SSIM_KERNEL + 1
+typedef void (*VP8SSIMAccumulateClippedFunc)(const uint8_t* src1, int stride1,
+                                             const uint8_t* src2, int stride2,
+                                             int xo, int yo,  // center position
+                                             int W, int H,    // plane dimension
+                                             VP8DistoStats* const stats);
+
+// This version is called with the guarantee that you can load 8 bytes and
+// 8 rows at offset src1 and src2
+typedef void (*VP8SSIMAccumulateFunc)(const uint8_t* src1, int stride1,
+                                      const uint8_t* src2, int stride2,
+                                      VP8DistoStats* const stats);
+
+extern VP8SSIMAccumulateFunc VP8SSIMAccumulate;         // unclipped / unchecked
+extern VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped;   // with clipping
+
+// must be called before using any of the above directly
+void VP8SSIMDspInit(void);

 //------------------------------------------------------------------------------
 // Decoding
@ -159,16 +290,17 @@ extern VP8WHT VP8TransformWHT;
 // *dst is the destination block, with stride BPS. Boundary samples are
 // assumed accessible when needed.
 typedef void (*VP8PredFunc)(uint8_t* dst);
-extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
-extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
-extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];

 // clipping tables (for filtering)
 extern const int8_t* const VP8ksclip1;  // clips [-1020, 1020] to [-128, 127]
 extern const int8_t* const VP8ksclip2;  // clips [-112, 112] to [-16, 15]
 extern const uint8_t* const VP8kclip1;  // clips [-255,511] to [0,255]
 extern const uint8_t* const VP8kabs0;   // abs(x) for x in [-255,255]
-void VP8InitClipTables(void);           // must be called first
+// must be called first
+void VP8InitClipTables(void);

 // simple filter (only for luma)
 typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
@ -194,6 +326,15 @@ extern VP8LumaFilterFunc VP8HFilter16i;
 extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
 extern VP8ChromaFilterFunc VP8HFilter8i;

+// Dithering. Combines dithering values (centered around 128) with dst[],
+// according to: dst[] = clip(dst[] + (((dither[]-128) + 8) >> 4))
+#define VP8_DITHER_DESCALE 4
+#define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
+#define VP8_DITHER_AMP_BITS 7
+#define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
+extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
+                                   int dst_stride);
+
 // must be called before anything using the above
 void VP8DspInit(void);

@ -240,13 +381,81 @@ typedef void (*WebPYUV444Converter)(const uint8_t* y,
                                    const uint8_t* u, const uint8_t* v,
                                    uint8_t* dst, int len);

-extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];

 // Must be called before using the WebPUpsamplers[] (and for premultiplied
 // colorspaces like rgbA, rgbA4444, etc)
 void WebPInitUpsamplers(void);
 // Must be called before using WebPSamplers[]
 void WebPInitSamplers(void);
+// Must be called before using WebPYUV444Converters[]
+void WebPInitYUV444Converters(void);
+
+//------------------------------------------------------------------------------
+// ARGB -> YUV converters
+
+// Convert ARGB samples to luma Y.
+extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+// Convert ARGB samples to U/V with downsampling. do_store should be '1' for
+// even lines and '0' for odd ones. 'src_width' is the original width, not
+// the U/V one.
+extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+                                   int src_width, int do_store);
+
+// Convert a row of accumulated (four-values) of rgba32 toward U/V
+extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
+                                     uint8_t* u, uint8_t* v, int width);
+
+// Convert RGB or BGR to Y
+extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
+extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+
+// used for plain-C fallback.
+extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+                                  int src_width, int do_store);
+extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
+                                    uint8_t* u, uint8_t* v, int width);
+
+// Must be called before using the above.
+void WebPInitConvertARGBToYUV(void);
+
+//------------------------------------------------------------------------------
+// Rescaler
+
+struct WebPRescaler;
+
+// Import a row of data and save its contribution in the rescaler.
+// 'channel' denotes the channel number to be imported. 'Expand' corresponds to
+// the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
+typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
+                                          const uint8_t* src);
+
+extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
+extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
+
+// Export one row (starting at x_out position) from rescaler.
+// 'Expand' corresponds to the wrk->y_expand case.
+// Otherwise 'Shrink' is to be used
+typedef void (*WebPRescalerExportRowFunc)(struct WebPRescaler* const wrk);
+extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
+extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
+
+// Plain-C implementation, as fall-back.
+extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
+                                         const uint8_t* src);
+extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
+                                         const uint8_t* src);
+extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
+extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
+
+// Main entry calls:
+extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
+                                  const uint8_t* src);
+// Export one row (starting at x_out position) from rescaler.
+extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
+
+// Must be called first before using the above.
+void WebPRescalerDspInit(void);

 //------------------------------------------------------------------------------
 // Utilities for processing transparent channel.
@ -260,6 +469,18 @@ extern void (*WebPApplyAlphaMultiply)(
 extern void (*WebPApplyAlphaMultiply4444)(
    uint8_t* rgba4444, int w, int h, int stride);

+// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
+// Returns true if alpha[] plane has non-trivial values different from 0xff.
+extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
+                                int width, int height,
+                                uint8_t* dst, int dst_stride);
+
+// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
+// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
+                                        int width, int height,
+                                        uint32_t* dst, int dst_stride);
+
 // Extract the alpha values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlpha).
 // Returns true if there's only trivial 0xff alpha values.
@ -286,9 +507,61 @@ void WebPMultRows(uint8_t* ptr, int stride,
                  const uint8_t* alpha, int alpha_stride,
                  int width, int num_rows, int inverse);

+// Plain-C versions, used as fallback by some implementations.
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+                  int width, int inverse);
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
+
 // To be called first before using the above.
 void WebPInitAlphaProcessing(void);

+// ARGB packing function: a/r/g/b input is rgba or bgra order.
+extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
+                           const uint8_t* g, const uint8_t* b, int len,
+                           uint32_t* out);
+
+// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
+extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                          int len, int step, uint32_t* out);
+
+// To be called first before using the above.
+void VP8EncDspARGBInit(void);
+
+//------------------------------------------------------------------------------
+// Filter functions
+
+typedef enum {     // Filter types.
+  WEBP_FILTER_NONE = 0,
+  WEBP_FILTER_HORIZONTAL,
+  WEBP_FILTER_VERTICAL,
+  WEBP_FILTER_GRADIENT,
+  WEBP_FILTER_LAST = WEBP_FILTER_GRADIENT + 1,  // end marker
+  WEBP_FILTER_BEST,    // meta-types
+  WEBP_FILTER_FAST
+} WEBP_FILTER_TYPE;
+
+typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
+                               int stride, uint8_t* out);
+// In-place un-filtering.
+// Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'.
+typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds,
+                                 uint8_t* cur_line, int width);
+
+// Filter the given data using the given predictor.
+// 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
+// in raster order.
+// 'stride' is number of bytes per scan line (with possible padding).
+// 'out' should be pre-allocated.
+extern WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
+
+// In-place reconstruct the original data from the given filtered data.
+// Each call reconstructs one line in place, using the WebPUnfilterFunc
+// arguments ('prev_line', 'preds', 'cur_line', 'width'); the rows above it
+// are assumed to be already reconstructed.
+extern WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+
+// To be called first before using the above.
+void VP8FiltersInit(void);
+
 #ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -40,10 +40,27 @@ const int VP8DspScan[16 + 4 + 4] = {
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };

+// General-purpose util function: summarizes 'distribution' into 'histo'.
+// histo->max_value receives the largest strictly positive count (0 if there
+// is none) and histo->last_non_zero the highest index holding a strictly
+// positive count (1 if there is none).
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+                         VP8Histogram* const histo) {
+  int largest = 0;
+  int last_used_bin = 1;
+  int bin;
+  for (bin = 0; bin <= MAX_COEFF_THRESH; ++bin) {
+    const int count = distribution[bin];
+    if (count <= 0) continue;   // empty bins do not contribute
+    last_used_bin = bin;
+    if (largest < count) largest = count;
+  }
+  histo->max_value = largest;
+  histo->last_non_zero = last_used_bin;
+}
+
 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int k;
    int16_t out[16];
@ -52,11 +69,12 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,

    // Convert coefficients to bin.
    for (k = 0; k < 16; ++k) {
-      const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
+      const int v = abs(out[k]) >> 3;
      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
-      histo->distribution[clipped_value]++;
+      ++distribution[clipped_value];
    }
  }
+  VP8SetHistogramData(distribution, histo);
 }

 //------------------------------------------------------------------------------
@ -68,7 +86,7 @@ static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;

-static void InitTables(void) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255 + 255; ++i) {
@ -159,6 +177,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  }
 }

+// Performs two forward transforms through the VP8FTransform pointer: one at
+// src/ref and one 4 bytes further in both buffers. The second result is
+// written 16 coefficients after the first in 'out'.
+static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+  int i;
+  for (i = 0; i < 2; ++i) {
+    VP8FTransform(src + 4 * i, ref + 4 * i, out + 16 * i);
+  }
+}
+
 static void FTransformWHT(const int16_t* in, int16_t* out) {
  // input is 12b signed
  int32_t tmp[16];
@ -195,8 +218,6 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
 //------------------------------------------------------------------------------
 // Intra predictions

-#define DST(x, y) dst[(x) + (y) * BPS]
-
 static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  int j;
  for (j = 0; j < size; ++j) {
@ -207,7 +228,7 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
 static WEBP_INLINE void VerticalPred(uint8_t* dst,
                                     const uint8_t* top, int size) {
  int j;
-  if (top) {
+  if (top != NULL) {
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
  } else {
    Fill(dst, 127, size);
@ -216,7 +237,7 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,

 static WEBP_INLINE void HorizontalPred(uint8_t* dst,
                                       const uint8_t* left, int size) {
-  if (left) {
+  if (left != NULL) {
    int j;
    for (j = 0; j < size; ++j) {
      memset(dst + j * BPS, left[j], size);
@ -229,8 +250,8 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
 static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
                                   const uint8_t* top, int size) {
  int y;
-  if (left) {
-    if (top) {
+  if (left != NULL) {
+    if (top != NULL) {
      const uint8_t* const clip = clip1 + 255 - left[-1];
      for (y = 0; y < size; ++y) {
        const uint8_t* const clip_table = clip + left[y];
@ -248,7 +269,7 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
    // is equivalent to VE prediction where you just copy the top samples.
    // Note that if top samples are not available, the default value is
    // then 129, and not 127 as in the VerticalPred case.
-    if (top) {
+    if (top != NULL) {
      VerticalPred(dst, top, size);
    } else {
      Fill(dst, 129, size);
@ -261,15 +282,15 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
                               int size, int round, int shift) {
  int DC = 0;
  int j;
-  if (top) {
+  if (top != NULL) {
    for (j = 0; j < size; ++j) DC += top[j];
-    if (left) {   // top and left present
+    if (left != NULL) {   // top and left present
      for (j = 0; j < size; ++j) DC += left[j];
    } else {      // top, but no left
      DC += DC;
    }
    DC = (DC + round) >> shift;
-  } else if (left) {   // left but no top
+  } else if (left != NULL) {   // left but no top
    for (j = 0; j < size; ++j) DC += left[j];
    DC += DC;
    DC = (DC + round) >> shift;
@ -291,8 +312,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
  TrueMotion(C8TM8 + dst, left, top, 8);
  // V block
  dst += 8;
-  if (top) top += 8;
-  if (left) left += 16;
+  if (top != NULL) top += 8;
+  if (left != NULL) left += 16;
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
  HorizontalPred(C8HE8 + dst, left, 8);
@ -313,6 +334,7 @@ static void Intra16Preds(uint8_t* dst,
 //------------------------------------------------------------------------------
 // luma 4x4 prediction

+#define DST(x, y) dst[(x) + (y) * BPS]
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

@ -335,10 +357,10 @@ static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
-  *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
-  *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
-  *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
-  *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
+  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }

 static void DC4(uint8_t* dst, const uint8_t* top) {
@ -537,6 +559,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {

 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
 static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
@ -614,7 +637,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
      int level = QUANTDIV(coeff, iQ, B);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
-      in[j] = level * Q;
+      in[j] = level * (int)Q;
      out[n] = level;
      if (level) last = n;
    } else {
@ -625,6 +648,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return (last >= 0);
 }

+// Quantizes two consecutive 16-coefficient blocks through the
+// VP8EncQuantizeBlock pointer. Returns the first call's result OR'ed with
+// the second call's result shifted left by one bit.
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  const int nz_first = VP8EncQuantizeBlock(in, out, mtx);
+  const int nz_second = VP8EncQuantizeBlock(in + 16, out + 16, mtx);
+  return nz_first | (nz_second << 1);
+}
+
 static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
                            const VP8Matrix* const mtx) {
  int n, last = -1;
@ -640,7 +671,7 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
      int level = QUANTDIV(coeff, iQ, B);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
-      in[j] = level * Q;
+      in[j] = level * (int)Q;
      out[n] = level;
      if (level) last = n;
    } else {
@ -654,16 +685,84 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
 //------------------------------------------------------------------------------
 // Block copy

-static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  int y;
-  for (y = 0; y < size; ++y) {
-    memcpy(dst, src, size);
+  for (y = 0; y < h; ++y) {
+    memcpy(dst, src, w);
    src += BPS;
    dst += BPS;
  }
 }

-static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
+// Copies a block 4 bytes wide and 4 rows high from 'src' to 'dst' via Copy().
+static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 4, 4);
+}
+
+// Copies a block 16 bytes wide and 8 rows high from 'src' to 'dst' via
+// Copy().
+static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 16, 8);
+}
+
+//------------------------------------------------------------------------------
+
+// Accumulates into 'stats' the moments of the window centered at (xo, yo)
+// and extending VP8_SSIM_KERNEL samples on each side, after clipping the
+// window to the W x H plane. Each sample is added with a weight of 1.
+static void SSIMAccumulateClipped(const uint8_t* src1, int stride1,
+                                  const uint8_t* src2, int stride2,
+                                  int xo, int yo, int W, int H,
+                                  VP8DistoStats* const stats) {
+  int x_first = xo - VP8_SSIM_KERNEL;
+  int x_last = xo + VP8_SSIM_KERNEL;
+  int y_first = yo - VP8_SSIM_KERNEL;
+  int y_last = yo + VP8_SSIM_KERNEL;
+  int x, y;
+  // Clamp the window to the plane boundaries.
+  if (x_first < 0) x_first = 0;
+  if (x_last > W - 1) x_last = W - 1;
+  if (y_first < 0) y_first = 0;
+  if (y_last > H - 1) y_last = H - 1;
+  for (y = y_first; y <= y_last; ++y) {
+    const uint8_t* const row1 = src1 + y * stride1;
+    const uint8_t* const row2 = src2 + y * stride2;
+    for (x = x_first; x <= x_last; ++x) {
+      const int a = row1[x];
+      const int b = row2[x];
+      stats->w   += 1;
+      stats->xm  += a;
+      stats->ym  += b;
+      stats->xxm += a * a;
+      stats->xym += a * b;
+      stats->yym += b * b;
+    }
+  }
+}
+
+// Accumulates into 'stats' the moments of the full, unclipped window of
+// (2 * VP8_SSIM_KERNEL + 1) x (2 * VP8_SSIM_KERNEL + 1) samples starting at
+// src1 / src2. Each sample is added with a weight of 1.
+static void SSIMAccumulate(const uint8_t* src1, int stride1,
+                           const uint8_t* src2, int stride2,
+                           VP8DistoStats* const stats) {
+  const int window = 2 * VP8_SSIM_KERNEL + 1;
+  int row, col;
+  for (row = 0; row < window; ++row) {
+    for (col = 0; col < window; ++col) {
+      const int a = src1[col];
+      const int b = src2[col];
+      stats->w   += 1;
+      stats->xm  += a;
+      stats->ym  += b;
+      stats->xxm += a * a;
+      stats->xym += a * b;
+      stats->yym += b * b;
+    }
+    src1 += stride1;
+    src2 += stride2;
+  }
+}
+
+VP8SSIMAccumulateFunc VP8SSIMAccumulate;
+VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped;
+
+// Value of VP8GetCPUInfo seen by the last completed VP8SSIMDspInit() call.
+// It starts out pointing at itself, a value VP8GetCPUInfo is not expected to
+// hold, so that the first call performs the initialization.
+static volatile VP8CPUInfo ssim_last_cpuinfo_used =
+    (VP8CPUInfo)&ssim_last_cpuinfo_used;
+
+// Installs the plain-C SSIM accumulation functions. Returns immediately if
+// it already ran with the current value of VP8GetCPUInfo.
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
+  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  VP8SSIMAccumulate = SSIMAccumulate;
+  VP8SSIMAccumulateClipped = SSIMAccumulateClipped;
+
+  // Recorded last, once the function pointers above are set.
+  ssim_last_cpuinfo_used = VP8GetCPUInfo;
+}

 //------------------------------------------------------------------------------
 // Initialization
@ -673,6 +772,7 @@ static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
 VP8CHisto VP8CollectHistogram;
 VP8Idct VP8ITransform;
 VP8Fdct VP8FTransform;
+VP8Fdct VP8FTransform2;
 VP8WHT VP8FTransformWHT;
 VP8Intra4Preds VP8EncPredLuma4;
 VP8IntraPreds VP8EncPredLuma16;
@ -684,18 +784,22 @@ VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8Quantize2Blocks VP8EncQuantize2Blocks;
 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy16x8;

 extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitSSE41(void);
 extern void VP8EncDspInitAVX2(void);
 extern void VP8EncDspInitNEON(void);
 extern void VP8EncDspInitMIPS32(void);
+extern void VP8EncDspInitMIPSdspR2(void);

 static volatile VP8CPUInfo enc_last_cpuinfo_used =
    (VP8CPUInfo)&enc_last_cpuinfo_used;

-void VP8EncDspInit(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;

  VP8DspInit();  // common inverse transforms
@ -705,6 +809,7 @@ void VP8EncDspInit(void) {
  VP8CollectHistogram = CollectHistogram;
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
+  VP8FTransform2 = FTransform2;
  VP8FTransformWHT = FTransformWHT;
  VP8EncPredLuma4 = Intra4Preds;
  VP8EncPredLuma16 = Intra16Preds;
@ -716,14 +821,21 @@ void VP8EncDspInit(void) {
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
  VP8Copy4x4 = Copy4x4;
+  VP8Copy16x8 = Copy16x8;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8EncDspInitSSE41();
+      }
+#endif
    }
 #endif
 #if defined(WEBP_USE_AVX2)
@ -740,8 +852,12 @@ void VP8EncDspInit(void) {
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8EncDspInitMIPSdspR2();
+    }
 #endif
  }
  enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
-
--- a/src/dsp/enc_avx2.c
+++ b/src/dsp/enc_avx2.c
@ -18,7 +18,4 @@
 //------------------------------------------------------------------------------
 // Entry point

-extern void VP8EncDspInitAVX2(void);
-
-void VP8EncDspInitAVX2(void) {
-}
+WEBP_DSP_INIT_STUB(VP8EncDspInitAVX2)
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@ -17,13 +17,10 @@

 #if defined(WEBP_USE_MIPS32)

+#include "./mips_macro.h"
 #include "../enc/vp8enci.h"
 #include "../enc/cost.h"

-#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
-#define WORK_AROUND_GCC
-#endif
-
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;

@ -34,86 +31,86 @@ static const int kC2 = 35468;
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 // TEMP4..TEMP5 - temporary registers
 #define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
-  "lh      %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lh      %[temp18],      "#B"(%[temp20])                 \n\t"            \
-  "lh      %[temp17],      "#C"(%[temp20])                 \n\t"            \
-  "lh      %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP4"],    %[temp16],      %[temp18]       \n\t"            \
-  "subu    %[temp16],      %[temp16],      %[temp18]       \n\t"            \
-  "mul     %["#TEMP0"],    %[temp17],      %[kC2]          \n\t"            \
-  "mul     %[temp18],      %[temp19],      %[kC1]          \n\t"            \
-  "mul     %[temp17],      %[temp17],      %[kC1]          \n\t"            \
-  "mul     %[temp19],      %[temp19],      %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\n"            \
-  "sra     %[temp18],      %[temp18],      16              \n\n"            \
-  "sra     %[temp17],      %[temp17],      16              \n\n"            \
-  "sra     %[temp19],      %[temp19],      16              \n\n"            \
-  "subu    %["#TEMP2"],    %["#TEMP0"],    %[temp18]       \n\t"            \
-  "addu    %["#TEMP3"],    %[temp17],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP0"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"            \
-  "addu    %["#TEMP1"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP2"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP3"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"
+  "lh      %[temp16],      " #A "(%[temp20])                 \n\t"          \
+  "lh      %[temp18],      " #B "(%[temp20])                 \n\t"          \
+  "lh      %[temp17],      " #C "(%[temp20])                 \n\t"          \
+  "lh      %[temp19],      " #D "(%[temp20])                 \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp16],      %[temp18]       \n\t"          \
+  "subu    %[temp16],      %[temp16],      %[temp18]         \n\t"          \
+  "mul     %[" #TEMP0 "],    %[temp17],      %[kC2]          \n\t"          \
+  "mul     %[temp18],      %[temp19],      %[kC1]            \n\t"          \
+  "mul     %[temp17],      %[temp17],      %[kC1]            \n\t"          \
+  "mul     %[temp19],      %[temp19],      %[kC2]            \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\n"          \
+  "sra     %[temp18],      %[temp18],      16                \n\n"          \
+  "sra     %[temp17],      %[temp17],      16                \n\n"          \
+  "sra     %[temp19],      %[temp19],      16                \n\n"          \
+  "subu    %[" #TEMP2 "],    %[" #TEMP0 "],    %[temp18]     \n\t"          \
+  "addu    %[" #TEMP3 "],    %[temp17],      %[temp19]       \n\t"          \
+  "addu    %[" #TEMP0 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"          \
+  "addu    %[" #TEMP1 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP2 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP3 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"

 // macro for one horizontal pass in ITransformOne
 // MUL and STORE macros inlined
 // a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
 // temp0..temp15 holds tmp[0]..tmp[15]
-// A..D - offsets in bytes to load from ref and store to dst buffer
+// A - row index: ref/dst bytes are accessed at offsets (0..3) + BPS * A
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)            \
-  "addiu   %["#TEMP0"],    %["#TEMP0"],    4               \n\t"            \
-  "addu    %[temp16],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "subu    %[temp17],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "mul     %["#TEMP0"],    %["#TEMP4"],    %[kC2]          \n\t"            \
-  "mul     %["#TEMP8"],    %["#TEMP12"],   %[kC1]          \n\t"            \
-  "mul     %["#TEMP4"],    %["#TEMP4"],    %[kC1]          \n\t"            \
-  "mul     %["#TEMP12"],   %["#TEMP12"],   %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    16              \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    16              \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   16              \n\t"            \
-  "subu    %[temp18],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "addu    %[temp19],      %["#TEMP4"],    %["#TEMP12"]    \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP8"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP12"],   %[temp16],      %[temp19]       \n\t"            \
-  "lw      %[temp20],      0(%[args])                      \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    3               \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    3               \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    3               \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   3               \n\t"            \
-  "lbu     %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lbu     %[temp17],      "#B"(%[temp20])                 \n\t"            \
-  "lbu     %[temp18],      "#C"(%[temp20])                 \n\t"            \
-  "lbu     %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %["#TEMP0"]     \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %["#TEMP4"]     \n\t"            \
-  "addu    %["#TEMP8"],    %[temp18],      %["#TEMP8"]     \n\t"            \
-  "addu    %["#TEMP12"],   %[temp19],      %["#TEMP12"]    \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    $zero           \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    $zero           \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    $zero           \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   $zero           \n\t"            \
-  "movn    %["#TEMP0"],    $zero,          %[temp16]       \n\t"            \
-  "movn    %["#TEMP4"],    $zero,          %[temp17]       \n\t"            \
-  "movn    %["#TEMP8"],    $zero,          %[temp18]       \n\t"            \
-  "movn    %["#TEMP12"],   $zero,          %[temp19]       \n\t"            \
-  "addiu   %[temp20],      $zero,          255             \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    %[temp20]       \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    %[temp20]       \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    %[temp20]       \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   %[temp20]       \n\t"            \
-  "movz    %["#TEMP0"],    %[temp20],      %[temp16]       \n\t"            \
-  "movz    %["#TEMP4"],    %[temp20],      %[temp17]       \n\t"            \
-  "lw      %[temp16],      8(%[args])                      \n\t"            \
-  "movz    %["#TEMP8"],    %[temp20],      %[temp18]       \n\t"            \
-  "movz    %["#TEMP12"],   %[temp20],      %[temp19]       \n\t"            \
-  "sb      %["#TEMP0"],    "#A"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP4"],    "#B"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP8"],    "#C"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP12"],   "#D"(%[temp16])                 \n\t"
+#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12)                       \
+  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4               \n\t"          \
+  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]          \n\t"          \
+  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]          \n\t"          \
+  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]          \n\t"          \
+  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]          \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16              \n\t"          \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16              \n\t"          \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16              \n\t"          \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16              \n\t"          \
+  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]    \n\t"          \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[temp19]         \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[temp18]         \n\t"          \
+  "subu    %[" #TEMP8 "],    %[temp17],      %[temp18]         \n\t"          \
+  "subu    %[" #TEMP12 "],   %[temp16],      %[temp19]         \n\t"          \
+  "lw      %[temp20],      0(%[args])                          \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    3               \n\t"          \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    3               \n\t"          \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    3               \n\t"          \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   3               \n\t"          \
+  "lbu     %[temp16],      0+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp17],      1+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp18],      2+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp19],      3+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[" #TEMP0 "]     \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[" #TEMP4 "]     \n\t"          \
+  "addu    %[" #TEMP8 "],    %[temp18],      %[" #TEMP8 "]     \n\t"          \
+  "addu    %[" #TEMP12 "],   %[temp19],      %[" #TEMP12 "]    \n\t"          \
+  "slt     %[temp16],      %[" #TEMP0 "],    $zero             \n\t"          \
+  "slt     %[temp17],      %[" #TEMP4 "],    $zero             \n\t"          \
+  "slt     %[temp18],      %[" #TEMP8 "],    $zero             \n\t"          \
+  "slt     %[temp19],      %[" #TEMP12 "],   $zero             \n\t"          \
+  "movn    %[" #TEMP0 "],    $zero,          %[temp16]         \n\t"          \
+  "movn    %[" #TEMP4 "],    $zero,          %[temp17]         \n\t"          \
+  "movn    %[" #TEMP8 "],    $zero,          %[temp18]         \n\t"          \
+  "movn    %[" #TEMP12 "],   $zero,          %[temp19]         \n\t"          \
+  "addiu   %[temp20],      $zero,          255                 \n\t"          \
+  "slt     %[temp16],      %[" #TEMP0 "],    %[temp20]         \n\t"          \
+  "slt     %[temp17],      %[" #TEMP4 "],    %[temp20]         \n\t"          \
+  "slt     %[temp18],      %[" #TEMP8 "],    %[temp20]         \n\t"          \
+  "slt     %[temp19],      %[" #TEMP12 "],   %[temp20]         \n\t"          \
+  "movz    %[" #TEMP0 "],    %[temp20],      %[temp16]         \n\t"          \
+  "movz    %[" #TEMP4 "],    %[temp20],      %[temp17]         \n\t"          \
+  "lw      %[temp16],      8(%[args])                          \n\t"          \
+  "movz    %[" #TEMP8 "],    %[temp20],      %[temp18]         \n\t"          \
+  "movz    %[" #TEMP12 "],   %[temp20],      %[temp19]         \n\t"          \
+  "sb      %[" #TEMP0 "],    0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP4 "],    1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP8 "],    2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"

 // Does one or two inverse transforms.
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
@ -130,10 +127,10 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
    VERTICAL_PASS(4, 20, 12, 28, temp12, temp8,  temp9,  temp10, temp11)
    VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)

-    HORIZONTAL_PASS( 0,  1,  2,  3, temp0, temp4, temp8,  temp12)
-    HORIZONTAL_PASS(16, 17, 18, 19, temp1, temp5, temp9,  temp13)
-    HORIZONTAL_PASS(32, 33, 34, 35, temp2, temp6, temp10, temp14)
-    HORIZONTAL_PASS(48, 49, 50, 51, temp3, temp7, temp11, temp15)
+    HORIZONTAL_PASS(0, temp0, temp4, temp8,  temp12)
+    HORIZONTAL_PASS(1, temp1, temp5, temp9,  temp13)
+    HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
+    HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)

    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
@ -164,9 +161,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
 // K - offset in bytes (kZigzag[n] * 4)
 // N - offset in bytes (n * 2)
 #define QUANTIZE_ONE(J, K, N)                                               \
-  "lh           %[temp0],       "#J"(%[ppin])                       \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppsharpen])                  \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppzthresh])                  \n\t"   \
+  "lh           %[temp0],       " #J "(%[ppin])                     \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppsharpen])                \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppzthresh])                \n\t"   \
  "sra          %[sign],        %[temp0],           15              \n\t"   \
  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
@ -175,9 +172,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
  "addiu        %[temp5],       $zero,              0               \n\t"   \
  "addiu        %[level],       $zero,              0               \n\t"   \
  "beqz         %[temp4],       2f                                  \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppiq])                       \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppbias])                     \n\t"   \
-  "lhu          %[temp3],       "#J"(%[ppq])                        \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppiq])                     \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppbias])                   \n\t"   \
+  "lhu          %[temp3],       " #J "(%[ppq])                      \n\t"   \
  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
  "sra          %[level],       %[level],           17              \n\t"   \
@ -187,8 +184,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
  "subu         %[level],       %[level],           %[sign]         \n\t"   \
  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
 "2:                                                                 \n\t"   \
-  "sh           %[temp5],       "#J"(%[ppin])                       \n\t"   \
-  "sh           %[level],       "#N"(%[pout])                       \n\t"
+  "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
+  "sh           %[level],       " #N "(%[pout])                     \n\t"

 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
@ -241,46 +238,54 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return 0;
 }

+// Quantizes two consecutive 16-coefficient blocks with the same matrix.
+// The first block's QuantizeBlock() result is OR-ed with the second block's
+// result shifted left by one bit.
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  const int first_nz = QuantizeBlock(&in[0], &out[0], mtx);
+  const int second_nz = QuantizeBlock(&in[16], &out[16], mtx);
+  return first_nz | (second_nz << 1);
+}
+
 #undef QUANTIZE_ONE

 // macro for one horizontal pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
-// A..D - offsets in bytes to load from a and b buffers
+// A - row index: a/b bytes are loaded from offsets (0..3) + BPS * A
 // E..H - offsets in bytes to store first results to tmp buffer
 // E1..H1 - offsets in bytes to store second results to tmp buffer
-#define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1)   \
-  "lbu    %[temp0],  "#A"(%[a])              \n\t"                \
-  "lbu    %[temp1],  "#B"(%[a])              \n\t"                \
-  "lbu    %[temp2],  "#C"(%[a])              \n\t"                \
-  "lbu    %[temp3],  "#D"(%[a])              \n\t"                \
-  "lbu    %[temp4],  "#A"(%[b])              \n\t"                \
-  "lbu    %[temp5],  "#B"(%[b])              \n\t"                \
-  "lbu    %[temp6],  "#C"(%[b])              \n\t"                \
-  "lbu    %[temp7],  "#D"(%[b])              \n\t"                \
-  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
-  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
-  "addu   %[temp2],  %[temp1],    %[temp3]   \n\t"                \
-  "subu   %[temp1],  %[temp1],    %[temp3]   \n\t"                \
-  "addu   %[temp3],  %[temp4],    %[temp6]   \n\t"                \
-  "subu   %[temp4],  %[temp4],    %[temp6]   \n\t"                \
-  "addu   %[temp6],  %[temp5],    %[temp7]   \n\t"                \
-  "subu   %[temp5],  %[temp5],    %[temp7]   \n\t"                \
-  "addu   %[temp7],  %[temp8],    %[temp2]   \n\t"                \
-  "subu   %[temp2],  %[temp8],    %[temp2]   \n\t"                \
-  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
-  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
-  "addu   %[temp1],  %[temp3],    %[temp6]   \n\t"                \
-  "subu   %[temp3],  %[temp3],    %[temp6]   \n\t"                \
-  "addu   %[temp6],  %[temp4],    %[temp5]   \n\t"                \
-  "subu   %[temp4],  %[temp4],    %[temp5]   \n\t"                \
-  "sw     %[temp7],  "#E"(%[tmp])            \n\t"                \
-  "sw     %[temp2],  "#H"(%[tmp])            \n\t"                \
-  "sw     %[temp8],  "#F"(%[tmp])            \n\t"                \
-  "sw     %[temp0],  "#G"(%[tmp])            \n\t"                \
-  "sw     %[temp1],  "#E1"(%[tmp])           \n\t"                \
-  "sw     %[temp3],  "#H1"(%[tmp])           \n\t"                \
-  "sw     %[temp6],  "#F1"(%[tmp])           \n\t"                \
-  "sw     %[temp4],  "#G1"(%[tmp])           \n\t"
+#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1)                  \
+  "lbu    %[temp0],  0+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp1],  1+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp2],  2+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp3],  3+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp4],  0+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp5],  1+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp6],  2+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp7],  3+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp2]         \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]         \n\t"                \
+  "addu   %[temp2],  %[temp1],    %[temp3]         \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp3]         \n\t"                \
+  "addu   %[temp3],  %[temp4],    %[temp6]         \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp6]         \n\t"                \
+  "addu   %[temp6],  %[temp5],    %[temp7]         \n\t"                \
+  "subu   %[temp5],  %[temp5],    %[temp7]         \n\t"                \
+  "addu   %[temp7],  %[temp8],    %[temp2]         \n\t"                \
+  "subu   %[temp2],  %[temp8],    %[temp2]         \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]         \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]         \n\t"                \
+  "addu   %[temp1],  %[temp3],    %[temp6]         \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp6]         \n\t"                \
+  "addu   %[temp6],  %[temp4],    %[temp5]         \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp5]         \n\t"                \
+  "sw     %[temp7],  " #E "(%[tmp])                \n\t"                \
+  "sw     %[temp2],  " #H "(%[tmp])                \n\t"                \
+  "sw     %[temp8],  " #F "(%[tmp])                \n\t"                \
+  "sw     %[temp0],  " #G "(%[tmp])                \n\t"                \
+  "sw     %[temp1],  " #E1 "(%[tmp])               \n\t"                \
+  "sw     %[temp3],  " #H1 "(%[tmp])               \n\t"                \
+  "sw     %[temp6],  " #F1 "(%[tmp])               \n\t"                \
+  "sw     %[temp4],  " #G1 "(%[tmp])               \n\t"

 // macro for one vertical pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
@ -295,10 +300,10 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
 // A1..D1 - offsets in bytes to load second results from tmp buffer
 // E..H - offsets in bytes to load from w buffer
 #define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
-  "lw     %[temp0],  "#A1"(%[tmp])           \n\t"                \
-  "lw     %[temp1],  "#C1"(%[tmp])           \n\t"                \
-  "lw     %[temp2],  "#B1"(%[tmp])           \n\t"                \
-  "lw     %[temp3],  "#D1"(%[tmp])           \n\t"                \
+  "lw     %[temp0],  " #A1 "(%[tmp])         \n\t"                \
+  "lw     %[temp1],  " #C1 "(%[tmp])         \n\t"                \
+  "lw     %[temp2],  " #B1 "(%[tmp])         \n\t"                \
+  "lw     %[temp3],  " #D1 "(%[tmp])         \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@ -319,18 +324,18 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
-  "lhu    %[temp4],  "#E"(%[w])              \n\t"                \
-  "lhu    %[temp5],  "#F"(%[w])              \n\t"                \
-  "lhu    %[temp6],  "#G"(%[w])              \n\t"                \
-  "lhu    %[temp7],  "#H"(%[w])              \n\t"                \
+  "lhu    %[temp4],  " #E "(%[w])            \n\t"                \
+  "lhu    %[temp5],  " #F "(%[w])            \n\t"                \
+  "lhu    %[temp6],  " #G "(%[w])            \n\t"                \
+  "lhu    %[temp7],  " #H "(%[w])            \n\t"                \
  "madd   %[temp4],  %[temp3]                \n\t"                \
  "madd   %[temp5],  %[temp1]                \n\t"                \
  "madd   %[temp6],  %[temp0]                \n\t"                \
  "madd   %[temp7],  %[temp8]                \n\t"                \
-  "lw     %[temp0],  "#A"(%[tmp])            \n\t"                \
-  "lw     %[temp1],  "#C"(%[tmp])            \n\t"                \
-  "lw     %[temp2],  "#B"(%[tmp])            \n\t"                \
-  "lw     %[temp3],  "#D"(%[tmp])            \n\t"                \
+  "lw     %[temp0],  " #A "(%[tmp])          \n\t"                \
+  "lw     %[temp1],  " #C "(%[tmp])          \n\t"                \
+  "lw     %[temp2],  " #B "(%[tmp])          \n\t"                \
+  "lw     %[temp3],  " #D "(%[tmp])          \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@ -362,10 +367,10 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

  __asm__ volatile(
-    HORIZONTAL_PASS( 0,  1,  2,  3,    0,  4,  8, 12,    64,  68,  72,  76)
-    HORIZONTAL_PASS(16, 17, 18, 19,   16, 20, 24, 28,    80,  84,  88,  92)
-    HORIZONTAL_PASS(32, 33, 34, 35,   32, 36, 40, 44,    96, 100, 104, 108)
-    HORIZONTAL_PASS(48, 49, 50, 51,   48, 52, 56, 60,   112, 116, 120, 124)
+    HORIZONTAL_PASS(0,   0,  4,  8, 12,    64,  68,  72,  76)
+    HORIZONTAL_PASS(1,  16, 20, 24, 28,    80,  84,  88,  92)
+    HORIZONTAL_PASS(2,  32, 36, 40, 44,    96, 100, 104, 108)
+    HORIZONTAL_PASS(3,  48, 52, 56, 60,   112, 116, 120, 124)
    "mthi   $zero                             \n\t"
    "mtlo   $zero                             \n\t"
    VERTICAL_PASS( 0, 16, 32, 48,     64, 80,  96, 112,   0,  8, 16, 24)
@ -405,73 +410,73 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,

 // macro for one horizontal pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
-// A..D - offsets in bytes to load from src and ref buffers
+// A - row index; bytes are loaded from offsets (0..3) + A * BPS in src and ref
 // TEMP0..TEMP3 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
-  "lw     %["#TEMP1"],  0(%[args])                     \n\t"    \
-  "lw     %["#TEMP2"],  4(%[args])                     \n\t"    \
-  "lbu    %[temp16],    "#A"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#A"(%["#TEMP2"])              \n\t"    \
-  "lbu    %[temp18],    "#B"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#B"(%["#TEMP2"])              \n\t"    \
-  "subu   %[temp20],    %[temp16],    %[temp17]        \n\t"    \
-  "lbu    %[temp16],    "#C"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#C"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP0"],  %[temp18],    %[temp19]        \n\t"    \
-  "lbu    %[temp18],    "#D"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#D"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP1"],  %[temp16],    %[temp17]        \n\t"    \
-  "subu   %["#TEMP2"],  %[temp18],    %[temp19]        \n\t"    \
-  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "mul    %[temp16],    %["#TEMP2"],  %[c5352]         \n\t"    \
-  "mul    %[temp17],    %["#TEMP2"],  %[c2217]         \n\t"    \
-  "mul    %[temp18],    %["#TEMP0"],  %[c5352]         \n\t"    \
-  "mul    %[temp19],    %["#TEMP0"],  %[c2217]         \n\t"    \
-  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]        \n\t"    \
-  "subu   %[temp20],    %["#TEMP3"],  %[temp20]        \n\t"    \
-  "sll    %["#TEMP0"],  %["#TEMP1"],  3                \n\t"    \
-  "sll    %["#TEMP2"],  %[temp20],    3                \n\t"    \
-  "addiu  %[temp16],    %[temp16],    1812             \n\t"    \
-  "addiu  %[temp17],    %[temp17],    937              \n\t"    \
-  "addu   %[temp16],    %[temp16],    %[temp19]        \n\t"    \
-  "subu   %[temp17],    %[temp17],    %[temp18]        \n\t"    \
-  "sra    %["#TEMP1"],  %[temp16],    9                \n\t"    \
-  "sra    %["#TEMP3"],  %[temp17],    9                \n\t"
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                  \
+  "lw     %[" #TEMP1 "],  0(%[args])                           \n\t"    \
+  "lw     %[" #TEMP2 "],  4(%[args])                           \n\t"    \
+  "lbu    %[temp16],    0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp17],    0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "lbu    %[temp18],    1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp19],    1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[temp20],    %[temp16],    %[temp17]                \n\t"    \
+  "lbu    %[temp16],    2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp17],    2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[" #TEMP0 "],  %[temp18],    %[temp19]              \n\t"    \
+  "lbu    %[temp18],    3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp19],    3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[" #TEMP1 "],  %[temp16],    %[temp17]              \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp18],    %[temp19]              \n\t"    \
+  "addu   %[" #TEMP3 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
+  "addu   %[temp20],    %[" #TEMP0 "],  %[" #TEMP1 "]          \n\t"    \
+  "subu   %[" #TEMP0 "],  %[" #TEMP0 "],  %[" #TEMP1 "]        \n\t"    \
+  "mul    %[temp16],    %[" #TEMP2 "],  %[c5352]               \n\t"    \
+  "mul    %[temp17],    %[" #TEMP2 "],  %[c2217]               \n\t"    \
+  "mul    %[temp18],    %[" #TEMP0 "],  %[c5352]               \n\t"    \
+  "mul    %[temp19],    %[" #TEMP0 "],  %[c2217]               \n\t"    \
+  "addu   %[" #TEMP1 "],  %[" #TEMP3 "],  %[temp20]            \n\t"    \
+  "subu   %[temp20],    %[" #TEMP3 "],  %[temp20]              \n\t"    \
+  "sll    %[" #TEMP0 "],  %[" #TEMP1 "],  3                    \n\t"    \
+  "sll    %[" #TEMP2 "],  %[temp20],    3                      \n\t"    \
+  "addiu  %[temp16],    %[temp16],    1812                     \n\t"    \
+  "addiu  %[temp17],    %[temp17],    937                      \n\t"    \
+  "addu   %[temp16],    %[temp16],    %[temp19]                \n\t"    \
+  "subu   %[temp17],    %[temp17],    %[temp18]                \n\t"    \
+  "sra    %[" #TEMP1 "],  %[temp16],    9                      \n\t"    \
+  "sra    %[" #TEMP3 "],  %[temp17],    9                      \n\t"

 // macro for one vertical pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to store to out buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)  \
-  "addu   %[temp16],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "subu   %[temp19],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "addu   %[temp17],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "subu   %[temp18],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "mul    %["#TEMP8"],  %[temp19],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP12"], %[temp18],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP4"],  %[temp19],    %[c5352]         \n\t"    \
-  "mul    %[temp18],    %[temp18],    %[c5352]         \n\t"    \
-  "addiu  %[temp16],    %[temp16],    7                \n\t"    \
-  "addu   %["#TEMP0"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP0"],  %["#TEMP0"],  4                \n\t"    \
-  "addu   %["#TEMP12"], %["#TEMP12"], %["#TEMP4"]      \n\t"    \
-  "subu   %["#TEMP4"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP4"],  %["#TEMP4"],  4                \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  30000            \n\t"    \
-  "addiu  %["#TEMP12"], %["#TEMP12"], 12000            \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  21000            \n\t"    \
-  "subu   %["#TEMP8"],  %["#TEMP8"],  %[temp18]        \n\t"    \
-  "sra    %["#TEMP12"], %["#TEMP12"], 16               \n\t"    \
-  "sra    %["#TEMP8"],  %["#TEMP8"],  16               \n\t"    \
-  "addiu  %[temp16],    %["#TEMP12"], 1                \n\t"    \
-  "movn   %["#TEMP12"], %[temp16],    %[temp19]        \n\t"    \
-  "sh     %["#TEMP0"],  "#A"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP4"],  "#C"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP8"],  "#D"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP12"], "#B"(%[temp20])                \n\t"
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)    \
+  "addu   %[temp16],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "subu   %[temp19],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "addu   %[temp17],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "subu   %[temp18],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "mul    %[" #TEMP8 "],  %[temp19],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP12 "], %[temp18],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP4 "],  %[temp19],    %[c5352]         \n\t"    \
+  "mul    %[temp18],    %[temp18],    %[c5352]           \n\t"    \
+  "addiu  %[temp16],    %[temp16],    7                  \n\t"    \
+  "addu   %[" #TEMP0 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP0 "],  %[" #TEMP0 "],  4              \n\t"    \
+  "addu   %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "]  \n\t"    \
+  "subu   %[" #TEMP4 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP4 "],  %[" #TEMP4 "],  4              \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  30000          \n\t"    \
+  "addiu  %[" #TEMP12 "], %[" #TEMP12 "], 12000          \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  21000          \n\t"    \
+  "subu   %[" #TEMP8 "],  %[" #TEMP8 "],  %[temp18]      \n\t"    \
+  "sra    %[" #TEMP12 "], %[" #TEMP12 "], 16             \n\t"    \
+  "sra    %[" #TEMP8 "],  %[" #TEMP8 "],  16             \n\t"    \
+  "addiu  %[temp16],    %[" #TEMP12 "], 1                \n\t"    \
+  "movn   %[" #TEMP12 "], %[temp16],    %[temp19]        \n\t"    \
+  "sh     %[" #TEMP0 "],  " #A "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP4 "],  " #C "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"

 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -483,10 +488,10 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
      { (const int*)src, (const int*)ref, (const int*)out };

  __asm__ volatile(
-    HORIZONTAL_PASS( 0,  1,  2,  3, temp0,  temp1,  temp2,  temp3)
-    HORIZONTAL_PASS(16, 17, 18, 19, temp4,  temp5,  temp6,  temp7)
-    HORIZONTAL_PASS(32, 33, 34, 35, temp8,  temp9,  temp10, temp11)
-    HORIZONTAL_PASS(48, 49, 50, 51, temp12, temp13, temp14, temp15)
+    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
+    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
+    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
+    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
    "lw   %[temp20],    8(%[args])                     \n\t"
    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
@ -508,128 +513,17 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS

-// Forward declaration.
-extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
-
-int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
-  int n = res->first;
-  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
-  int cost;
-  const int const_2 = 2;
-  const int const_255 = 255;
-  const int const_max_level = MAX_VARIABLE_LEVEL;
-  int res_cost;
-  int res_prob;
-  int res_coeffs;
-  int res_last;
-  int v_reg;
-  int b_reg;
-  int ctx_reg;
-  int cost_add, temp_1, temp_2, temp_3;
-
-  if (res->last < 0) {
-    return VP8BitCost(0, p0);
-  }
-
-  cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
-
-  res_cost = (int)res->cost;
-  res_prob = (int)res->prob;
-  res_coeffs = (int)res->coeffs;
-  res_last = (int)res->last;
-
-  __asm__ volatile(
-    ".set   push                                                           \n\t"
-    ".set   noreorder                                                      \n\t"
-
-    "sll    %[temp_1],     %[n],              1                            \n\t"
-    "addu   %[res_coeffs], %[res_coeffs],     %[temp_1]                    \n\t"
-    "slt    %[temp_2],     %[n],              %[res_last]                  \n\t"
-    "bnez   %[temp_2],     1f                                              \n\t"
-    " li    %[cost_add],   0                                               \n\t"
-    "b      2f                                                             \n\t"
-    " nop                                                                  \n\t"
-  "1:                                                                      \n\t"
-    "lh     %[v_reg],      0(%[res_coeffs])                                \n\t"
-    "addu   %[b_reg],      %[n],              %[VP8EncBands]               \n\t"
-    "move   %[temp_1],     %[const_max_level]                              \n\t"
-    "addu   %[cost],       %[cost],           %[cost_add]                  \n\t"
-    "negu   %[temp_2],     %[v_reg]                                        \n\t"
-    "slti   %[temp_3],     %[v_reg],          0                            \n\t"
-    "movn   %[v_reg],      %[temp_2],         %[temp_3]                    \n\t"
-    "lbu    %[b_reg],      1(%[b_reg])                                     \n\t"
-    "li     %[cost_add],   0                                               \n\t"
-
-    "sltiu  %[temp_3],     %[v_reg],          2                            \n\t"
-    "move   %[ctx_reg],    %[v_reg]                                        \n\t"
-    "movz   %[ctx_reg],    %[const_2],        %[temp_3]                    \n\t"
-    //  cost += VP8LevelCost(t, v);
-    "slt    %[temp_3],     %[v_reg],          %[const_max_level]           \n\t"
-    "movn   %[temp_1],     %[v_reg],          %[temp_3]                    \n\t"
-    "sll    %[temp_2],     %[v_reg],          1                            \n\t"
-    "addu   %[temp_2],     %[temp_2],         %[VP8LevelFixedCosts]        \n\t"
-    "lhu    %[temp_2],     0(%[temp_2])                                    \n\t"
-    "sll    %[temp_1],     %[temp_1],         1                            \n\t"
-    "addu   %[temp_1],     %[temp_1],         %[t]                         \n\t"
-    "lhu    %[temp_3],     0(%[temp_1])                                    \n\t"
-    "addu   %[cost],       %[cost],           %[temp_2]                    \n\t"
-
-    //  t = res->cost[b][ctx];
-    "sll    %[temp_1],     %[ctx_reg],        7                            \n\t"
-    "sll    %[temp_2],     %[ctx_reg],        3                            \n\t"
-    "addu   %[cost],       %[cost],           %[temp_3]                    \n\t"
-    "addu   %[temp_1],     %[temp_1],         %[temp_2]                    \n\t"
-    "sll    %[temp_2],     %[b_reg],          3                            \n\t"
-    "sll    %[temp_3],     %[b_reg],          5                            \n\t"
-    "sub    %[temp_2],     %[temp_3],         %[temp_2]                    \n\t"
-    "sll    %[temp_3],     %[temp_2],         4                            \n\t"
-    "addu   %[temp_1],     %[temp_1],         %[temp_3]                    \n\t"
-    "addu   %[temp_2],     %[temp_2],         %[res_cost]                  \n\t"
-    "addiu  %[n],          %[n],              1                            \n\t"
-    "addu   %[t],          %[temp_1],         %[temp_2]                    \n\t"
-    "slt    %[temp_1],     %[n],              %[res_last]                  \n\t"
-    "bnez   %[temp_1],     1b                                              \n\t"
-    " addiu %[res_coeffs], %[res_coeffs],     2                            \n\t"
-   "2:                                                                     \n\t"
-
-    ".set   pop                                                            \n\t"
-    : [cost]"+r"(cost), [t]"+r"(t), [n]"+r"(n), [v_reg]"=&r"(v_reg),
-      [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [cost_add]"=&r"(cost_add),
-      [temp_1]"=&r"(temp_1), [temp_2]"=&r"(temp_2), [temp_3]"=&r"(temp_3)
-    : [const_2]"r"(const_2), [const_255]"r"(const_255), [res_last]"r"(res_last),
-      [VP8EntropyCost]"r"(VP8EntropyCost), [VP8EncBands]"r"(VP8EncBands),
-      [const_max_level]"r"(const_max_level), [res_prob]"r"(res_prob),
-      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_coeffs]"r"(res_coeffs),
-      [res_cost]"r"(res_cost)
-    : "memory"
-  );
-
-  // Last coefficient is always non-zero
-  {
-    const int v = abs(res->coeffs[n]);
-    assert(v != 0);
-    cost += VP8LevelCost(t, v);
-    if (n < 15) {
-      const int b = VP8EncBands[n + 1];
-      const int ctx = (v == 1) ? 1 : 2;
-      const int last_p0 = res->prob[b][ctx][0];
-      cost += VP8BitCost(0, last_p0);
-    }
-  }
-  return cost;
-}
+#if !defined(WORK_AROUND_GCC)

 #define GET_SSE_INNER(A, B, C, D)                               \
-  "lbu     %[temp0],    "#A"(%[a])                   \n\t"      \
-  "lbu     %[temp1],    "#A"(%[b])                   \n\t"      \
-  "lbu     %[temp2],    "#B"(%[a])                   \n\t"      \
-  "lbu     %[temp3],    "#B"(%[b])                   \n\t"      \
-  "lbu     %[temp4],    "#C"(%[a])                   \n\t"      \
-  "lbu     %[temp5],    "#C"(%[b])                   \n\t"      \
-  "lbu     %[temp6],    "#D"(%[a])                   \n\t"      \
-  "lbu     %[temp7],    "#D"(%[b])                   \n\t"      \
+  "lbu     %[temp0],    " #A "(%[a])                 \n\t"      \
+  "lbu     %[temp1],    " #A "(%[b])                 \n\t"      \
+  "lbu     %[temp2],    " #B "(%[a])                 \n\t"      \
+  "lbu     %[temp3],    " #B "(%[b])                 \n\t"      \
+  "lbu     %[temp4],    " #C "(%[a])                 \n\t"      \
+  "lbu     %[temp5],    " #C "(%[b])                 \n\t"      \
+  "lbu     %[temp6],    " #D "(%[a])                 \n\t"      \
+  "lbu     %[temp7],    " #D "(%[b])                 \n\t"      \
  "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
  "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
  "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
@ -645,7 +539,6 @@ int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)

-#if !defined(WORK_AROUND_GCC)
 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -653,29 +546,29 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  __asm__ volatile(
     "mult   $zero,    $zero                            \n\t"

-     GET_SSE(  0,   4,   8,  12)
-     GET_SSE( 16,  20,  24,  28)
-     GET_SSE( 32,  36,  40,  44)
-     GET_SSE( 48,  52,  56,  60)
-     GET_SSE( 64,  68,  72,  76)
-     GET_SSE( 80,  84,  88,  92)
-     GET_SSE( 96, 100, 104, 108)
-     GET_SSE(112, 116, 120, 124)
-     GET_SSE(128, 132, 136, 140)
-     GET_SSE(144, 148, 152, 156)
-     GET_SSE(160, 164, 168, 172)
-     GET_SSE(176, 180, 184, 188)
-     GET_SSE(192, 196, 200, 204)
-     GET_SSE(208, 212, 216, 220)
-     GET_SSE(224, 228, 232, 236)
-     GET_SSE(240, 244, 248, 252)
+     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
+     GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
+     GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
+     GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+     GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+     GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+     GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+     GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+     GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)

    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
-    : "memory", "hi" , "lo"
+    : "memory", "hi", "lo"
  );
  return count;
 }
@ -687,21 +580,21 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  __asm__ volatile(
     "mult   $zero,    $zero                            \n\t"

-     GET_SSE(  0,   4,   8,  12)
-     GET_SSE( 16,  20,  24,  28)
-     GET_SSE( 32,  36,  40,  44)
-     GET_SSE( 48,  52,  56,  60)
-     GET_SSE( 64,  68,  72,  76)
-     GET_SSE( 80,  84,  88,  92)
-     GET_SSE( 96, 100, 104, 108)
-     GET_SSE(112, 116, 120, 124)
+     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)

    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
-    : "memory", "hi" , "lo"
+    : "memory", "hi", "lo"
  );
  return count;
 }
@ -713,17 +606,17 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  __asm__ volatile(
     "mult   $zero,    $zero                            \n\t"

-     GET_SSE( 0,   4,  16,  20)
-     GET_SSE(32,  36,  48,  52)
-     GET_SSE(64,  68,  80,  84)
-     GET_SSE(96, 100, 112, 116)
+     GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+     GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+     GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+     GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)

    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
-    : "memory", "hi" , "lo"
+    : "memory", "hi", "lo"
  );
  return count;
 }
@ -735,42 +628,45 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  __asm__ volatile(
     "mult   $zero,    $zero                            \n\t"

-     GET_SSE(0, 16, 32, 48)
+     GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)

    "mflo    %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
-    : "memory", "hi" , "lo"
+    : "memory", "hi", "lo"
  );
  return count;
 }

-#endif  // WORK_AROUND_GCC
+#undef GET_SSE
+#undef GET_SSE_INNER

-#undef GET_SSE_MIPS32
-#undef GET_SSE_MIPS32_INNER
-
-#endif  // WEBP_USE_MIPS32
+#endif  // !WORK_AROUND_GCC

 //------------------------------------------------------------------------------
 // Entry point

 extern void VP8EncDspInitMIPS32(void);

-void VP8EncDspInitMIPS32(void) {
-#if defined(WEBP_USE_MIPS32)
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
-  VP8FTransform = FTransform;
 #if !defined(WORK_AROUND_GCC)
  VP8SSE16x16 = SSE16x16;
  VP8SSE8x8 = SSE8x8;
  VP8SSE16x8 = SSE16x8;
  VP8SSE4x4 = SSE4x4;
 #endif
-#endif  // WEBP_USE_MIPS32
 }
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
--- a/Show More
+++ b/Show More