update ChangeLog

Change-Id: Ia1e4669e6270faa6daae6306f47baa31488f119d
vwebp: disable double buffering on windows & mac
2025-07-15 05:19:48 +02:00 · 2017-11-25 19:01:58 -08:00 · 2017-11-25 18:22:39 -08:00 · 2017-11-25 13:52:03 -08:00 · 2017-11-24 22:40:15 -08:00 · 2017-11-24 14:21:05 -08:00
206 changed files with 4852 additions and 5435 deletions
--- a/Android.mk
+++ b/Android.mk
@ -55,9 +55,6 @@ dsp_dec_srcs := \
    src/dsp/alpha_processing_neon.$(NEON) \
    src/dsp/alpha_processing_sse2.c \
    src/dsp/alpha_processing_sse41.c \
-    src/dsp/argb.c \
-    src/dsp/argb_mips_dsp_r2.c \
-    src/dsp/argb_sse2.c \
    src/dsp/cpu.c \
    src/dsp/dec.c \
    src/dsp/dec_clip_tables.c \
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -4,17 +4,17 @@ project(libwebp C)

 # Options for coder / decoder executables.
 option(WEBP_ENABLE_SIMD "Enable any SIMD optimization." ON)
-option(WEBP_ENABLE_WASM "Enable WebAssembly optimizations." OFF)
 option(WEBP_BUILD_CWEBP "Build the cwebp command line tool." OFF)
 option(WEBP_BUILD_DWEBP "Build the dwebp command line tool." OFF)
 option(WEBP_BUILD_GIF2WEBP "Build the gif2webp conversion tool." OFF)
 option(WEBP_BUILD_IMG2WEBP "Build the img2webp animation tool." OFF)
 option(WEBP_BUILD_WEBPINFO "Build the webpinfo command line tool." OFF)
 option(WEBP_BUILD_WEBP_JS "Emscripten build of webp.js." OFF)
+option(WEBP_ENABLE_NEAR_LOSSLESS "Enable near-lossless encoding" ON)
 option(WEBP_EXPERIMENTAL_FEATURES "Build with experimental features." OFF)
 option(WEBP_ENABLE_SWAP_16BIT_CSP "Enable byte swap for 16 bit colorspaces." OFF)

-if(WEBP_BUILD_WEBP_JS OR WEBP_ENABLE_WASM)
+if(WEBP_BUILD_WEBP_JS)
  set(WEBP_ENABLE_SIMD OFF)
 endif()

@ -27,19 +27,13 @@ if(NOT CMAKE_BUILD_TYPE)
  )
 endif()

-include(cmake/config.h.cmake)
-
-# Extract the version of the library.
-file(READ ${CMAKE_CURRENT_SOURCE_DIR}/configure.ac SOURCE_FILE)
-string(REGEX MATCH "[0-9.]+" WEBP_VERSION ${SOURCE_FILE})
+# Include dependencies.
+include(cmake/deps.cmake)

 ################################################################################
 # Options.
 if(WEBP_ENABLE_SWAP_16BIT_CSP)
-  add_definitions(-DWEBP_SWAP_16BIT_CSP)
-endif()
-if(WEBP_ENABLE_WASM)
-  add_definitions(-DWEBP_USE_WASM)
+  add_definitions(-DWEBP_SWAP_16BIT_CSP=1)
 endif()

 ################################################################################
@ -54,7 +48,10 @@ if(ANDROID)
  set(WEBP_DEP_INCLUDE_DIRS ${WEBP_DEP_INCLUDE_DIRS}
    ${ANDROID_NDK}/sources/android/cpufeatures
  )
-  add_definitions(-DHAVE_CPU_FEATURES_H)
+  add_definitions(-DHAVE_CPU_FEATURES_H=1)
+  set(HAVE_CPU_FEATURES_H 1)
+else()
+  set(HAVE_CPU_FEATURES_H 0)
 endif()

 ################################################################################
@ -106,8 +103,13 @@ endforeach()

 ### Define the mandatory libraries.
 # Build the webpdecoder library.
-add_definitions(-Wall)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/ ${WEBP_DEP_INCLUDE_DIRS})
+if(MSVC)
+  # avoid security warnings for e.g., fopen() used in the examples.
+  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+else()
+  add_definitions(-Wall)
+endif()
+include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${WEBP_DEP_INCLUDE_DIRS})
 add_library(webpdecode OBJECT ${WEBP_DEC_SRCS})
 add_library(webpdspdecode OBJECT ${WEBP_DSP_COMMON_SRCS} ${WEBP_DSP_DEC_SRCS})
 add_library(webputilsdecode OBJECT ${WEBP_UTILS_COMMON_SRCS}
@ -145,13 +147,13 @@ function(parse_version FILE NAME VAR)
  set(${VAR} "${VERSION}" PARENT_SCOPE)
 endfunction()
 parse_version(Makefile.am webp WEBP_WEBP_SOVERSION)
-set_target_properties(webp PROPERTIES VERSION ${WEBP_VERSION}
+set_target_properties(webp PROPERTIES VERSION ${PACKAGE_VERSION}
  SOVERSION ${WEBP_WEBP_SOVERSION})
 parse_version(Makefile.am webpdecoder WEBP_DECODER_SOVERSION)
-set_target_properties(webpdecoder PROPERTIES VERSION ${WEBP_VERSION}
+set_target_properties(webpdecoder PROPERTIES VERSION ${PACKAGE_VERSION}
  SOVERSION ${WEBP_DECODER_SOVERSION})
 parse_version(demux/Makefile.am webpdemux WEBP_DEMUX_SOVERSION)
-set_target_properties(webpdemux PROPERTIES VERSION ${WEBP_VERSION}
+set_target_properties(webpdemux PROPERTIES VERSION ${PACKAGE_VERSION}
  SOVERSION ${WEBP_DEMUX_SOVERSION})

 # Define the libraries to install.
@ -167,11 +169,9 @@ math(EXPR WEBP_SIMD_FILES_TO_INCLUDE_RANGE
 foreach(I_FILE RANGE ${WEBP_SIMD_FILES_TO_INCLUDE_RANGE})
  list(GET WEBP_SIMD_FILES_TO_INCLUDE ${I_FILE} FILE)
  list(GET WEBP_SIMD_FLAGS_TO_INCLUDE ${I_FILE} SIMD_COMPILE_FLAG)
-  if(NOT ${SIMD_COMPILE_FLAG} STREQUAL "NOTFOUND")
-    set_source_files_properties(${FILE} PROPERTIES
-      COMPILE_FLAGS ${SIMD_COMPILE_FLAG}
-    )
-  endif()
+  set_source_files_properties(${FILE} PROPERTIES
+    COMPILE_FLAGS ${SIMD_COMPILE_FLAG}
+  )
 endforeach()

 # Build the executables if asked for.
@ -200,6 +200,10 @@ if(WEBP_BUILD_CWEBP OR WEBP_BUILD_DWEBP OR
    "imageenc_[^ ]*")
  add_library(imageenc ${IMAGEENC_SRCS})
  target_link_libraries(imageenc webp)
+
+  set_property(TARGET exampleutil imageioutil imagedec imageenc
+    PROPERTY INCLUDE_DIRECTORIES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if(WEBP_BUILD_DWEBP)
@ -210,6 +214,8 @@ if(WEBP_BUILD_DWEBP)
  add_executable(dwebp ${DWEBP_SRCS})
  target_link_libraries(dwebp exampleutil imagedec imageenc webpdecoder)
  install(TARGETS dwebp RUNTIME DESTINATION bin)
+  set_property(TARGET dwebp PROPERTY INCLUDE_DIRECTORIES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if(WEBP_BUILD_CWEBP)
@ -220,6 +226,12 @@ if(WEBP_BUILD_CWEBP)
  add_executable(cwebp ${CWEBP_SRCS})
  target_link_libraries(cwebp exampleutil imagedec webp)
  install(TARGETS cwebp RUNTIME DESTINATION bin)
+  set_property(TARGET cwebp PROPERTY INCLUDE_DIRECTORIES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
+endif()
+
+if(WEBP_BUILD_GIF2WEBP AND NOT GIF_FOUND)
+  unset(WEBP_BUILD_GIF2WEBP CACHE)
 endif()

 if(WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP)
@ -228,7 +240,7 @@ if(WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP)
  add_library(webpmux ${WEBP_MUX_SRCS})
  target_link_libraries(webpmux webp)
  parse_version(mux/Makefile.am webpmux WEBP_MUX_SOVERSION)
-  set_target_properties(webpmux PROPERTIES VERSION ${WEBP_VERSION}
+  set_target_properties(webpmux PROPERTIES VERSION ${PACKAGE_VERSION}
    SOVERSION ${WEBP_MUX_SOVERSION})
  list(APPEND INSTALLED_LIBRARIES webpmux)
 endif()
@ -242,6 +254,8 @@ if(WEBP_BUILD_GIF2WEBP)
  target_link_libraries(gif2webp exampleutil imageioutil webp webpmux
    ${WEBP_DEP_GIF_LIBRARIES})
  install(TARGETS gif2webp RUNTIME DESTINATION bin)
+  set_property(TARGET gif2webp PROPERTY INCLUDE_DIRECTORIES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if(WEBP_BUILD_IMG2WEBP)
@ -252,6 +266,8 @@ if(WEBP_BUILD_IMG2WEBP)
  add_executable(img2webp ${IMG2WEBP_SRCS})
  target_link_libraries(img2webp exampleutil imagedec imageioutil webp webpmux)
  install(TARGETS img2webp RUNTIME DESTINATION bin)
+  set_property(TARGET img2webp PROPERTY INCLUDE_DIRECTORIES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if (WEBP_BUILD_WEBPINFO)
@ -262,6 +278,8 @@ if (WEBP_BUILD_WEBPINFO)
  add_executable(webpinfo ${WEBPINFO_SRCS})
  target_link_libraries(webpinfo exampleutil imageioutil)
  install(TARGETS webpinfo RUNTIME DESTINATION bin)
+  set_property(TARGET webpinfo PROPERTY INCLUDE_DIRECTORIES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if(WEBP_BUILD_WEBP_JS)
@ -269,6 +287,7 @@ if(WEBP_BUILD_WEBP_JS)
  add_executable(webp_js
                 ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
  target_link_libraries(webp_js webpdecoder SDL)
+  set(WEBP_HAVE_SDL 1)
  set_target_properties(webp_js PROPERTIES LINK_FLAGS
      "-s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0")
  set_target_properties(webp_js PROPERTIES OUTPUT_NAME webp)
@ -286,6 +305,14 @@ if(WEBP_BUILD_WEBP_JS)
  target_compile_definitions(webpdecoder PUBLIC EMSCRIPTEN)
 endif()

+# Generate the config.h file.
+configure_file(${CMAKE_CURRENT_LIST_DIR}/cmake/config.h.in
+  ${CMAKE_CURRENT_BINARY_DIR}/src/webp/config.h)
+add_definitions(-DHAVE_CONFIG_H)
+# The build directory is added to the include path so that the generated
+# header can be found as src/webp/config.h.
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
 # Install the different headers and libraries.
 install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h
              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/demux.h
@ -302,7 +329,7 @@ install(TARGETS ${INSTALLED_LIBRARIES}
 include(CMakePackageConfigHelpers)
 write_basic_package_version_file(
  "${CMAKE_CURRENT_BINARY_DIR}/WebPConfigVersion.cmake"
-  VERSION ${WEBP_VERSION}
+  VERSION ${PACKAGE_VERSION}
  COMPATIBILITY AnyNewerVersion
 )

--- a/283
+++ b/283
@ -1,9 +1,292 @@
+c10a493c vwebp: disable double buffering on windows & mac
+0d4466c2 webp_to_sdl.c: fix file mode
+1b27bf8b WEBP_REDUCE_SIZE: disable all rescaler code
+126be109 webpinfo: add -version option
+9add62b5 bump version to 0.6.1
+d3e26144 update NEWS
+2edda639 README: add webpinfo section
+9ca568ef Merge "right-size some tables"
+31f1995c Merge "SSE2 implementation of HasAlphaXXX"
+a80c46bd SSE2 implementation of HasAlphaXXX
+083507f2 right-size some tables
+2e5785b2 anim_utils.c: remove warning when !defined(WEBP_HAVE_GIF)
+b299c47e add WEBP_REDUCE_SIZE
+f593d71a enc: disable pic->stats/extra_info w/WEBP_DISABLE_STATS
+541179a9 Merge "predictor_enc: fix build w/--disable-near-lossless"
+5755a7ec predictor_enc: fix build w/--disable-near-lossless
+eab5bab7 add WEBP_DISABLE_STATS
+8052c585 remove some petty TODOs from vwebp.
+c245343d move LOAD8x4 and STORE8x2 closer to their use location
+b9e734fd dec,cosmetics: normalize function naming style
+c188d546 dec: harmonize function suffixes
+28c5ac81 dec_sse41: harmonize function suffixes
+e65b72a3 Merge "introduce WebPHasAlpha8b and WebPHasAlpha32b"
+b94cee98 dec_sse2: remove HE8uv_SSE2
+44a0ee3f introduce WebPHasAlpha8b and WebPHasAlpha32b
+aebf59ac Merge "WebPPictureAllocARGB: align argb allocation"
+c184665e WebPPictureAllocARGB: align argb allocation
+3daf7509 WebPParseHeaders: remove obsolete animation TODO
+80285d97 cmake: avoid security warnings under msvc
+650eac55 cmake: don't set -Wall with MSVC
+c462cd00 Remove useless code.
+01a98217 Merge "remove WebPWorkerImpl declaration from the header"
+3c49fc47 Merge "thread_utils: fix potentially bad call to Execute"
+fde2782e thread_utils: fix potentially bad call to Execute
+2a270c1d remove WebPWorkerImpl declaration from the header
+f1f437cc remove mention of 'lossy-only parameters' from the doc
+3879074d Merge "WebPMemToUint32: remove ptr cast to int"
+04b029d2 WebPMemToUint32: remove ptr cast to int
+b7971d0e dsp: avoid defining _C functions w/NEON builds
+6ba98764 webpdec: correct alloc size check w/use_argb
+5cfb3b0f normalize include guards
+f433205e Merge changes Ia17c7dfc,I75423abb,Ia2f716b4,I161caa14,I4210081a, ...
+8d033b14 {dec,enc}_neon: harmonize function suffixes x2
+0295e981 upsampling_neon: harmonize function suffixes
+d572c4e5 yuv_neon: harmonize function suffixes
+ab9c2500 rescaler_neon: harmonize function suffixes
+93e0ce27 lossless_neon: harmonize function suffixes
+22fbc50e lossless_enc_neon: harmonize function suffixes
+447875b4 filters_neon,cosmetics: fix indent
+e51bdd43 remove unused VP8TokenToStats() function
+785da7ea enc_neon: harmonize function suffixes
+bc1a251f dec_neon: harmonize function suffixes
+61e535f1 dsp/lossless: workaround gcc-4.8 bug on arm
+68b2eab7 cwebp: fix alpha reporting w/lossless & metadata
+30042faa WebPDemuxGetI: add doc details around WebPFormatFeature
+0a17f471 Merge "WIP: list includes as descendants of the project dir"
+a4399721 WIP: list includes as descendants of the project dir
+08275708 Merge "Make sure we reach the full range for alpha blending."
+d361a6a7 yuv_sse2: harmonize function suffixes
+6921aa6f upsampling_sse2: harmonize function suffixes
+08c67d3e ssim_sse2: harmonize function suffixes
+582a1b57 rescaler_sse2: harmonize function suffixes
+2c1b18ba lossless_sse2: harmonize function suffixes
+0ac46e81 lossless_enc_sse2: harmonize function suffixes
+bc634d57 enc_sse2: harmonize function suffixes
+bcb7347c dec_sse2: harmonize function suffixes
+e14ad93c Make sure we reach the full range for alpha blending.
+7038ca8d demux,StoreFrame: restore hdr size check to min req
+fb3daad6 cpu: fix ssse3 check
+be590e06 Merge "Fix CMake redefinition for HAVE_CPU_FEATURES_H"
+35f736e1 Fix CMake redefinition for HAVE_CPU_FEATURES_H
+a5216efc Fix integer overflow warning.
+a9c8916b decode.h,WebPIDecGetRGB: clarify output ptr validity
+3c74c645 gif2webp: handle 1-frame case properly + fix anim_diff
+c7f295d3 Merge "gif2webp: introduce -loop_compatibility option"
+b4e04677 gif2webp: introduce -loop_compatibility option
+f78da3de add LOCAL_CLANG_PREREQ and avoid WORK_AROUND_GCC w/3.8+
+01c426f1 define WEBP_USE_INTRINSICS w/gcc-4.9+
+8635973d use sdl-config (if available) to determine the link flags
+e9459382 use CPPFLAGS before CFLAGS
+4a9d788e Merge "Android.mk,mips: fix clang build with r15"
+4fbdc9fb Android.mk,mips: fix clang build with r15
+a80fcc4a ifdef code not used by Chrome/Android.
+3993af12 Fix signed integer overflows.
+f66f94ef anim_dump: small tool to dump frames from animated WebP
+6eba857b Merge "rationalize the Makefile.am"
+c5e34fba function definition cleanup
+3822762a rationalize the Makefile.am
+501ef6e4 configure style fix: animdiff -> anim_diff
+f8bdc268 Merge "protect against NULL dump_folder[] value in ReadAnimatedImage()"
+23bfc652 protect against NULL dump_folder[] value in ReadAnimatedImage()
+8dc3d71b cosmetics,ReadAnimatedWebP: correct function comment
+5bd40066 Merge changes I66a64a0a,I4d2e520f
+7945575c cosmetics,webpinfo: remove an else after a return
+8729fa11 cosmetics,cwebp: remove an else after a return
+f324b7f9 cosmetics: normalize fn proto & decl param names
+869eb369 CMake cleanups.
+289e62a3 Remove declaration of unimplemented VP8ApplyNearLosslessPredict
+20a94186 pnmdec,PAM: validate depth before calculating bytes_per_px
+34130afe anim_encode: fix integer overflow
+42c79aa6 Merge "Encoder: harmonize function suffixes"
+b09307dc Encoder: harmonize function suffixes
+bed0456d Merge "SSIM: harmonize the function suffix"
+54f6a3cf lossless_sse2.c: fix some missed suffix changes
+088f1dcc SSIM: harmonize the function suffix
+86fc4dd9 webpdec: use ImgIoUtilCheckSizeArgumentsOverflow
+08ea9ecd imageio: add ability restrict max image size
+6f9daa4a jpegdec,ReadError: fix leaks on error
+a0f72a4f VP8LTransformColorFunc: drop an non-respected 'const' from the signature.
+8c934902 Merge "Lossess dec: harmonize the function suffixes"
+622242aa Lossess dec: harmonize the function suffixes
+1411f027 Lossless Enc: harmonize the function suffixes
+24ad2e3c add const to two variables
+46efe062 Merge "Allow the lossless cruncher to work for alpha."
+8c3f9a47 Speed-up LZ77.
+1aef4c71 Allow the lossless cruncher to work for alpha.
+b8821dbd Improve the box LZ77 speed.
+7beed280 add missing ()s to macro parameters
+6473d20b Merge "fix Android standalone toolchain build"
+dcefed95 Merge "build.gradle: fix arm64 build"
+0c83a8bc Merge "yuv: harmonize suffix naming"
+c6d1db4b fix Android standalone toolchain build
+663a6d9d unify the ALTERNATE_CODE flag usage
+73ea9f27 yuv: harmonize suffix naming
+c71b68ac build.gradle: fix arm64 build
+c4568b47 Rescaler: harmonize the suffix naming
+6cb13b05 Merge "alpha_processing: harmonize the naming suffixes to be _C()"
+83a3e69a Merge "simplify WEBP_EXTERN macro"
+7295fde2 Merge "filters: harmonize the suffixes naming to _SSE2(), _C(), etc."
+8e42ba4c simplify WEBP_EXTERN macro
+331ab34b cost*.c: harmonize the suffix namings
+b161f670 filters: harmonize the suffixes naming to _SSE2(), _C(), etc.
+dec5e4d3 alpha_processing: harmonize the naming suffixes to be _C()
+6878d427 fix memory leak in SDL_Init()
+461ae555 Merge "configure: fix warnings in sdl check"
+62486a22 configure: test for -Wundef
+92982609 dsp.h: fix -Wundef w/__mips_dsp_rev
+0265cede configure: fix warnings in sdl check
+88c73d8a backward_references_enc.h: fix WINDOW_SIZE_BITS check
+4ea49f6b rescaler_sse2.c: fix WEBP_RESCALER_FIX -> _RFIX typo
+1b526638 Clean-up some CMake
+87f57a4b Merge "cmake: fix gif lib detection when cross compiling"
+b34a9db1 cosmetics,dec_sse2: remove some redundant comments
+471c5755 cmake: fix gif lib detection when cross compiling
+c793417a cmake: disable gif2webp if gif lib isn't found
+dcbc1c88 cmake: split gif detection from IMG deps
+66ad84f0 Merge "muxread: remove unreachable code"
+50ec3ab7 muxread: remove unreachable code
+7d67a164 Lossy encoding: smoothen transparent areas to improve compression
+e50650c7 Merge "fix signature for DISABLE_TOKEN_BUFFER compilation"
+671d2567 fix signature for DISABLE_TOKEN_BUFFER compilation
+d6755580 cpu.cmake: use unique flag to test simd disable flags
+28914528 Merge "Remove the argb* files."
+8acb4942 Remove the argb* files.
+3b62347b README: correct cmake invocation note
+7ca0df13 Have the SSE2 version of PackARGB use common code.
+7b250459 Merge "Re-use the transformed image when trying several LZ77 in lossless."
+e132072f Re-use the transformed image when trying several LZ77 in lossless.
+5d7a50ef Get code to compile in C++.
+7b012987 configure: test for -Wparentheses-equality
+f0569adb Fix man pages for multi-threading.
+f1d5a397 multithread cruncher: only copy stats when picture->stats != NULL
+f8c2ac15 Multi-thread the lossless cruncher.
+a88c6522 Merge "Integrate a new LZ77 looking for matches in the neighborhood of a pixel only."
+8f6df1d0 Unroll Predictors 10, 11 and 12.
+355c3d1b Integrate a new LZ77 looking for matches in the neighborhood of a pixel only.
+a1779a01 Refactor LZ77 handling in preparation for a new method.
+67de68b5 Android.mk/build.gradle: fix mips build with clang from r14b
+f209a548 Use the plane code and not the distance when computing statistics.
+b903b80c Split cost-based backward references in its own file.
+498cad34 Cosmetic changes in backward reference.
+e4eb4587 lossless, VP8LTransformColor_C: make sure no overflow happens with colors.
+af6deaff webpinfo: handle alpha flag mismatch
+7caef29b Fix typo that creeped in.
+39e19f92 Merge "near lossless: fix unsigned int overflow warnings."
+9bbc0891 near lossless: fix unsigned int overflow warnings.
+e1118d62 Merge "cosmetics,FindClosestDiscretized: use uint in mask creation"
+186bc9b7 Merge "webpinfo: tolerate ALPH+VP8L"
+b5887297 cosmetics,FindClosestDiscretized: use uint in mask creation
+f1784aee near_lossless,FindClosestDiscretized: use unsigned ops
+0d20abb3 webpinfo: tolerate ALPH+VP8L
+972104b3 webpmux: tolerate false positive Alpha flag
+dd7e83cc tiffdec,ReadTIFF: ensure data_size is < tsize_t max
+d988eb7b tiffdec,MyRead: quiet -Wshorten-64-to-32 warning
+dabda707 webpinfo: add support to parse Alpha bitstream
+4c117643 webpinfo: correct background color output, BGRA->ARGB
+defc98d7 Doc: clarify the role of quality in WebPConfig.
+d78ff780 Merge "Fix code to compile with C++."
+c8f14093 Fix code to compile with C++.
+497dc6a7 pnmdec: sanitize invalid header output
+d78e5867 Merge "configure: test for -Wconstant-conversion"
+481e91eb Merge "pnmdec,PAM: set bytes_per_px based on depth when missing"
+93b12753 configure: test for -Wconstant-conversion
+645f0c53 pnmdec,PAM: set bytes_per_px based on depth when missing
+e9154605 Merge "vwebp: activate GLUT double-buffering"
+818d795b vwebp: activate GLUT double-buffering
+d63e6f4b Add a man page for webpinfo
+4d708435 Merge "NEON: implement ConvertRGB24ToY/BGR24/ARGB/RGBA32ToUV/ARGBToUV"
+faf42213 NEON: implement ConvertRGB24ToY/BGR24/ARGB/RGBA32ToUV/ARGBToUV
+b4d576fa Install man pages with CMake.
+cbc1b921 webpinfo: add features to parse bitstream header
+e644c556 Fix bad bit writer initialization.
+b62cdad2 Merge "Implement a cruncher for lossless at method 6."
+da3e4dfb use the exact constant for the gamma transfer function
+a9c701e0 Merge "tiffdec: fix EXTRASAMPLES check"
+adab8ce0 Implement a cruncher for lossless at method 6.
+1b92b237 Merge "Fix VP8ApplyNearLossless to respect const and stride."
+1923ff02 tiffdec: fix EXTRASAMPLES check
+97cce5ba tiffdec: only request EXTRASAMPLES w/> 3 samples/px
+0dcd85b6 Fix VP8ApplyNearLossless to respect const and stride.
+f7682189 yuv: rationalize the C/SSE2 function naming
+52245424 NEON implementation of some Sharp-YUV420 functions
+690efd82 Avoid several backward reference copies.
+4bb1f607 src/dec/vp8_dec.h, cosmetics: fix comments
+285748be cmake: build/install webpinfo
+78fd199c backward_references_enc.c: clear -Wshadow warnings
+ae836410 WebPLog2FloorC: clear -Wshadow warning
+d0b7404e Merge "WASM support"
+134e314f WASM support
+c08adb6f Merge "VP8LEnc: remove use of BitsLog2Ceiling()"
+28c37ebd VP8LEnc: remove use of BitsLog2Ceiling()
+2cb58ab2 webpinfo: output format as a human readable string
+bb175a93 Merge "rename some symbols clashing with MSVC headers"
+39eda658 Remove a duplicated pixel hash implementation.
+36b8274d rename some symbols clashing with MSVC headers
+274daf54 Add webpinfo tool.
+ec5036e4 add explicit reference to /usr/local/{lib,inc}
+18f0dfac Merge "fix TIFF encoder regarding rgbA/RGBA"
+4e2b0b50 Merge "webpdec.h: fix a doc typo"
+e2eeabff Merge "Install binaries, libraries and headers in CMake."
+836607e6 webpdec.h: fix a doc typo
+9273e441 fix TIFF encoder regarding rgbA/RGBA
+17e3c11f Add limited PAM decoding support
+5f624871 Install binaries, libraries and headers in CMake.
+976adac1 Merge "lossless incremental decoding: fix missing eos_ test"
+f8fad4fa lossless incremental decoding: fix missing eos_ test
+27415d41 Merge "vwebp_sdl: fix the makefile.unix"
+49566182 Merge "ImgIoUtilWriteFile(): use ImgIoUtilSetBinaryMode"
+6f75a51b Analyze the transform entropy on the whole image.
+a5e4e3af Use palette only if we can in entropy analysis.
+75a9c3c4 Improve compression by better entropy analysis.
+39cf6f4f vwebp_sdl: fix the makefile.unix
+699b0416 ImgIoUtilWriteFile(): use ImgIoUtilSetBinaryMode
+7d985bd1 Fix small entropy analysis bug.
+6e7caf06 Optimize the color cache size.
+833c9219 More efficient stochastic histogram merge.
+5183326b Refactor the greedy histogram merge.
+99f6f462 Merge "histogram_enc.c,MyRand: s/ul/u/ for unsigned constants"
+80a22186 ssim.c: remove dead include
+a128dfff histogram_enc.c,MyRand: s/ul/u/ for unsigned constants
+693bf74e move the SSIM calculation code in ssim.c / ssim_sse2.c
+10d791ca Merge "Fix the random generator in HistogramCombineStochastic."
+fa63a966 Fix the random generator in HistogramCombineStochastic.
+16be192f VP8LSetBitPos: remove the eos_ setting
+027151ca don't erase the surface before blitting.
+4105d565 disable WEBP_USE_XXX optimisations when EMSCRIPTEN is defined
+9ee32a75 Merge "WebP-JS: emscripten-based Javascript decoder"
+ca9f7b7d WebP-JS: emscripten-based Javascript decoder
+868aa690 Perform greedy histogram merge in a unified way.
+5b393f2d Merge "fix path typo for vwebp_sdl in Makefile.vc"
+e0012bea CMake: only use libwebpdecoder for building dwebp
+84c2a7b0 fix path typo for vwebp_sdl in Makefile.vc
+1b0e4abf Merge "Add a flag to disable SIMD optimizations."
+32263250 Add a flag to disable SIMD optimizations.
+b494fdec optimize the ARGB->ARGB Import to use memcpy
+f1536039 Merge "ReadWebP: decode directly into a pre-allocated buffer"
+e69ed291 ReadWebP: decode directly into a pre-allocated buffer
+57d8de8a Merge "vwebp_sdl: simple viewer based on SDL"
+5cfd4ebc LZ77 interval speedups. Faster, smaller, simpler.
+1e7ad88b PNM header decoder: add some basic numerical validation
+17c7890c Merge "Add a decoder only library for WebP in CMake."
+be733786 Merge "Add clang build fix for MSA"
+03cda0e4 Add a decoder only library for WebP in CMake.
+aa893914 Add clang build fix for MSA
+31a92e97 Merge "imageio: add limited PNM support for reading"
+dcf9d82a imageio: add limited PNM support for reading
+6524fcd6 vwebp_sdl: simple viewer based on SDL
+6cf24a24 get_disto: fix reference file read
+43d472aa Merge tag 'v0.6.0'
+50d1a848 update ChangeLog (tag: v0.6.0, origin/0.6.0, 0.6.0)
 20a7fea0 extras/Makefile.am: fix libwebpextras.la reference
 415f3ffe update ChangeLog (tag: v0.6.0-rc3)
 3c6d1224 update NEWS
 ee4a4141 update AUTHORS
 32ed856f Fix "all|no frames are keyframes" settings.
+1c3190b6 Merge "Fix "all|no frames are keyframes" settings."
 f4dc56fd disable GradientUnfilter_NEON
+4f3e3bbd disable GradientUnfilter_NEON
+2dc0bdca Fix "all|no frames are keyframes" settings.
 0d8e0588 img2webp: treat -loop as a no-op w/single images
 b0450139 ReadImage(): restore size reporting
 0ad3b4ef update ChangeLog (tag: v0.6.0-rc2)
--- a/Makefile.vc
+++ b/Makefile.vc
@ -29,7 +29,7 @@ PLATFORM_LDFLAGS = /SAFESEH
 NOLOGO     = /nologo
 CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
 CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
-CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c
+CFLAGS     = /I. /Isrc $(NOLOGO) /W3 /EHsc /c
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
 LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
@ -155,6 +155,7 @@ CFGSET = TRUE
 !MESSAGE - all                            - build (de)mux-based targets for CFG
 !MESSAGE - gif2webp                       - requires libgif & >= VS2013
 !MESSAGE - anim_diff                      - requires libgif & >= VS2013
+!MESSAGE - anim_dump
 !MESSAGE
 !MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
 !MESSAGE   'legacy' will produce a Windows 2000 compatible library.
@ -233,9 +234,6 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\yuv_sse2.obj \

 DSP_ENC_OBJS = \
-    $(DIROBJ)\dsp\argb.obj \
-    $(DIROBJ)\dsp\argb_mips_dsp_r2.obj \
-    $(DIROBJ)\dsp\argb_sse2.obj \
    $(DIROBJ)\dsp\cost.obj \
    $(DIROBJ)\dsp\cost_mips32.obj \
    $(DIROBJ)\dsp\cost_mips_dsp_r2.obj \
@ -358,10 +356,15 @@ all: ex $(EXTRA_EXAMPLES)
 # C99 support which is only available from VS2013 onward.
 gif2webp: $(DIRBIN)\gif2webp.exe
 anim_diff: $(DIRBIN)\anim_diff.exe
+anim_dump: $(DIRBIN)\anim_dump.exe

 $(DIRBIN)\anim_diff.exe: $(DIROBJ)\examples\anim_diff.obj $(EX_ANIM_UTIL_OBJS)
 $(DIRBIN)\anim_diff.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS)
 $(DIRBIN)\anim_diff.exe: $(EX_GIF_DEC_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
+$(DIRBIN)\anim_dump.exe: $(DIROBJ)\examples\anim_dump.obj $(EX_ANIM_UTIL_OBJS)
+$(DIRBIN)\anim_dump.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS)
+$(DIRBIN)\anim_dump.exe: $(EX_GIF_DEC_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
+$(DIRBIN)\anim_dump.exe: $(IMAGEIO_ENC_OBJS)
 $(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj $(IMAGEIO_DEC_OBJS)
 $(DIRBIN)\cwebp.exe: $(IMAGEIO_UTIL_OBJS)
 $(DIRBIN)\dwebp.exe: $(DIROBJ)\examples\dwebp.obj $(IMAGEIO_DEC_OBJS)
@ -444,7 +447,7 @@ $(OUTPUT_DIRS):
 $(DIROBJ)\$(DLLINC):
 	@echo #ifndef WEBP_DLL_H_ > $@
 	@echo #define WEBP_DLL_H_ >> $@
-	@echo #define WEBP_EXTERN(type) __declspec(dllexport) type >> $@
+	@echo #define WEBP_EXTERN __declspec(dllexport) >> $@
 	@echo #endif  /* WEBP_DLL_H_ */ >> $@

 .SUFFIXES: .c .obj .res .exe
@ -456,6 +459,9 @@ $(DIROBJ)\dsp\enc_avx2.obj: src\dsp\enc_avx2.c
 $(DIROBJ)\examples\anim_diff.obj: examples\anim_diff.c
 	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
 	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
+$(DIROBJ)\examples\anim_dump.obj: examples\anim_dump.c
+	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
+	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
 $(DIROBJ)\examples\anim_util.obj: examples\anim_util.c
 	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
 	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
--- a/13
+++ b/13
@ -1,3 +1,16 @@
+- 11/24/2017: version 0.6.1
+  This is a binary compatible release.
+  * lossless performance and compression improvements + a new 'cruncher' mode
+    (-m 6 -q 100)
+  * ARM performance improvements with clang (15-20% w/ndk r15c, issue #339)
+  * webp-js: emscripten/webassembly based javascript decoder
+  * miscellaneous bug & build fixes (issue #329, #332, #343, #353, #360, #361,
+    #363)
+  Tool updates / additions:
+    added webpinfo - prints file format information (issue #330)
+    gif2webp - loop behavior modified to match Chrome M63+ (crbug.com/649264);
+               '-loop_compatibility' can be used for the old behavior
+
 - 1/26/2017: version 0.6.0
  * lossless performance and compression improvements
  * miscellaneous performance improvements (SSE2, NEON, MSA)
--- a/25
+++ b/25
@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.6.0
+               \____/____/\_____/_____/____/v0.6.1

 Description:
 ============
@ -113,8 +113,8 @@ make install

 CMake:
 ------
-With CMake, you can compile libwebp, cwebp, dwebp, gif2web, img2webp and the
-JS bindings.
+With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp, webpinfo
+and the JS bindings.

 Prerequisites:
 A compiler (e.g., gcc with autotools) and CMake.
@ -367,6 +367,23 @@ Use following options to convert into alternate image formats:
  -quiet ....... quiet mode, don't print anything
  -noasm ....... disable all assembly optimizations

+WebP file analysis tool:
+========================
+
+'webpinfo' can be used to print out the chunk level structure and bitstream
+header information of WebP files. It can also check whether the files are
+valid WebP files.
+
+Usage: webpinfo [options] in_files
+Note: there could be multiple input files;
+      options must come before input files.
+Options:
+  -version ........... Print version number and exit.
+  -quiet ............. Do not show chunk parsing information.
+  -diag .............. Show parsing error diagnosis.
+  -summary ........... Show chunk stats summary.
+  -bitstream_info .... Parse bitstream header.
+
 Visualization tool:
 ===================

@ -477,6 +494,8 @@ Options:
  -metadata <string> ..... comma separated list of metadata to
                           copy from the input to the output if present
                           Valid values: all, none, icc, xmp (default)
+  -loop_compatibility .... use compatibility mode for Chrome
+                           version prior to M62 (inclusive)
  -mt .................... use multi-threading if available

  -version ............... print version number and exit
--- a/README.mux
+++ b/README.mux
@ -1,7 +1,7 @@
          __   __  ____  ____  ____  __ __  _     __ __
         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
         \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.4.0
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.4.1


 Description:
--- a/README.wasm
+++ b/README.wasm
@ -1,91 +0,0 @@
-Description:
-============
-
-This file describes the compilation of libwebp using portable intrinsics /
-WebAssembly (wasm) to native targets using clang and CMake.
-
-Prerequisites:
-==============
-
- cmake 2.8+
-
- clang 3.9+ for portable intrinsics support; as wasm progresses a tip of tree
-  build may be necessary.
-
-Building:
-=========
-
- - configure the project with CMake using:
-
- $ mkdir -p build && \
-   cd build && \
-   cmake -DWEBP_BUILD_DWEBP=1 -DCMAKE_C_COMPILER=clang -DWEBP_ENABLE_WASM=1 ../
-
- - compile dwebp using 'make'.
-
- - Note this currently generates native executables only and is incompatible
-   with -DWEBP_BUILD_WEBP_JS.
-
-Build options:
-==============
-
- platform specific multiply high (mulhi) implementation, disabled by default.
-  arm: -DCMAKE_C_FLAGS='-DENABLE_NEON_BUILTIN_MULHI_INT16X8 ...'
-  x86: -DCMAKE_C_FLAGS='-DENABLE_X86_BUILTIN_MULHI_INT16X8 ...'
-
-Cross compilation:
-==================
-
- - arm toolchains can be obtained from:
-   http://www.linaro.org/downloads/
-
- - the android ndk can be obtained from:
-   https://developer.android.com/ndk/downloads/index.html
-
-armv7:
------
-
-Android:
- $ ./android-ndk-r15b/build/tools/make_standalone_toolchain.py \
-   --arch arm --api 24 --stl gnustl --install-dir /opt/android-arm-24
- $ mkdir -p build && cd build
- $ cmake ../libwebp \
-   -DWEBP_BUILD_DWEBP=1 \
-   -DCMAKE_C_COMPILER=/opt/android-arm-24/bin/clang \
-   -DCMAKE_PREFIX_PATH=/opt/android-arm-24/sysroot/usr/lib \
-   -DCMAKE_C_FLAGS=-fPIE \
-   -DCMAKE_EXE_LINKER_FLAGS=-Wl,-pie \
-   -DCMAKE_BUILD_TYPE=Release \
-   -DWEBP_ENABLE_WASM=1
-
-Linux:
- $ gcc_arm=/opt/gcc-arm; target=arm-linux-gnueabihf
- $ mkdir -p build && cd build
- $ cmake ../libwebp -DWEBP_BUILD_DWEBP=1 -DWEBP_ENABLE_WASM=1 \
-   -DCMAKE_C_COMPILER=clang \
-   -DCMAKE_C_FLAGS="--target=$target --gcc-toolchain=$gcc_arm --sysroot=$gcc_arm/$target/libc -march=armv7-a -mfpu=neon" \
-   -DCMAKE_PREFIX_PATH=$gcc_arm/$target/libc/usr
-
-aarch64 / arm64:
----------------
-
-Android:
- $ ./android-ndk-r15b/build/tools/make_standalone_toolchain.py \
-   --arch arm64 --api 24 --stl gnustl --install-dir /opt/android-arm64-24
- $ mkdir -p build && cd build
- $ cmake ../libwebp \
-   -DWEBP_BUILD_DWEBP=1 \
-   -DCMAKE_C_COMPILER=/opt/android-arm64-24/bin/clang \
-   -DCMAKE_PREFIX_PATH=/opt/android-arm64-24/sysroot/usr/lib \
-   -DCMAKE_C_FLAGS=-fPIE \
-   -DCMAKE_EXE_LINKER_FLAGS=-Wl,-pie \
-   -DCMAKE_BUILD_TYPE=Release \
-   -DWEBP_ENABLE_WASM=1
-
-Linux:
- $ gcc_arm=/opt/gcc-aarch64; target=aarch64-linux-gnu
- $ mkdir -p build && cd build
- $ cmake ../libwebp -DWEBP_BUILD_DWEBP=1 -DWEBP_ENABLE_WASM=1 \
-   -DCMAKE_C_COMPILER=clang \
-   -DCMAKE_C_FLAGS="--target=$target --gcc-toolchain=$gcc_arm --sysroot=$gcc_arm/$target/libc" \
-   -DCMAKE_PREFIX_PATH=$gcc_arm/$target/libc/usr
--- a/README.webp_js
+++ b/README.webp_js
@ -31,11 +31,6 @@ using Emscripten and CMake.
 - that's it! Upon completion, you should have the webp.js and
   webp.js.mem files generated.

- - Note this generates both webp_js and webp_wasm without any SIMD enabled due
-   to bugs with this toolchain associated with the SSE2 code.
-   -DWEBP_ENABLE_WASM is currently meant to generate native (x86, arm)
-   executables (dwebp, cwebp) and is incompatible with -DWEBP_BUILD_WEBP_JS.
-
 The callable JavaScript function is WebPToSDL(), which decodes a raw WebP
 bitstream into a canvas. See webp_js/index.html for a simple usage sample.

--- a/build.gradle
+++ b/build.gradle
@ -82,12 +82,14 @@ model {
        }
      }
      // Check for NEON usage.
-      if (getTargetPlatform() == "arm" || getTargetPlatform() == "arm64") {
+      if (getTargetPlatform() == "arm") {
        NEON = "c.neon"
        cCompiler.define "HAVE_CPU_FEATURES_H"
      } else {
        NEON = "c"
      }
+
+      cCompiler.args "-I" + file(".").absolutePath
    }
    // Link to pthread for shared libraries.
    withType(SharedLibraryBinarySpec) {
@ -120,9 +122,6 @@ model {
            include "alpha_processing_neon.$NEON"
            include "alpha_processing_sse2.c"
            include "alpha_processing_sse41.c"
-            include "argb.c"
-            include "argb_mips_dsp_r2.c"
-            include "argb_sse2.c"
            include "cpu.c"
            include "dec.c"
            include "dec_clip_tables.c"
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@ -13,6 +13,9 @@
 /* Set to 1 if __builtin_bswap64 is available */
 #cmakedefine HAVE_BUILTIN_BSWAP64 1

+/* Define to 1 if you have the <cpu-features.h> header file. */
+#cmakedefine HAVE_CPU_FEATURES_H 1
+
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #cmakedefine HAVE_DLFCN_H 1

@ -115,9 +118,19 @@
 /* Set to 1 if JPEG library is installed */
 #cmakedefine WEBP_HAVE_JPEG 1

+/* Set to 1 if NEON is supported */
+#cmakedefine WEBP_HAVE_NEON
+
+/* Set to 1 if runtime detection of NEON is enabled */
+/* TODO: handle properly in CMake */
+#cmakedefine WEBP_HAVE_NEON_RTCD
+
 /* Set to 1 if PNG library is installed */
 #cmakedefine WEBP_HAVE_PNG 1

+/* Set to 1 if SDL library is installed */
+#cmakedefine WEBP_HAVE_SDL 1
+
 /* Set to 1 if SSE2 is supported */
 #cmakedefine WEBP_HAVE_SSE2 1

@ -127,6 +140,9 @@
 /* Set to 1 if TIFF library is installed */
 #cmakedefine WEBP_HAVE_TIFF 1

+/* Enable near lossless encoding */
+#cmakedefine WEBP_NEAR_LOSSLESS 1
+
 /* Undefine this to disable thread support. */
 #cmakedefine WEBP_USE_THREAD 1

--- a/cmake/cpu.cmake
+++ b/cmake/cpu.cmake
@ -1,4 +1,5 @@
 ## Check for SIMD extensions.
+include(CMakePushCheckState)

 function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
  if(NOT ENABLE_SIMD)
@ -7,6 +8,8 @@ function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
    return()
  endif()
  unset(WEBP_HAVE_FLAG_${WEBP_SIMD_FLAG} CACHE)
+  cmake_push_check_state()
+  set(CMAKE_REQUIRED_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR})
  check_c_source_compiles("
      #include \"${CMAKE_CURRENT_LIST_DIR}/../src/dsp/dsp.h\"
      int main(void) {
@ -17,6 +20,7 @@ function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
      }
    " WEBP_HAVE_FLAG_${WEBP_SIMD_FLAG}
  )
+  cmake_pop_check_state()
  if(WEBP_HAVE_FLAG_${WEBP_SIMD_FLAG})
    set(WEBP_HAVE_${WEBP_SIMD_FLAG} 1 PARENT_SCOPE)
  else()
@ -60,6 +64,7 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
  # First try with no extra flag added as the compiler might have default flags
  # (especially on Android).
  unset(WEBP_HAVE_${WEBP_SIMD_FLAG} CACHE)
+  cmake_push_check_state()
  set(CMAKE_REQUIRED_FLAGS)
  webp_check_compiler_flag(${WEBP_SIMD_FLAG} ${WEBP_ENABLE_SIMD})
  if(NOT WEBP_HAVE_${WEBP_SIMD_FLAG})
@ -85,11 +90,8 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
    foreach(FILE ${SIMD_FILES})
      list(APPEND WEBP_SIMD_FILES_NOT_TO_INCLUDE ${FILE})
    endforeach()
-    # Explicitly disable SIMD. Avoid this with WASM to avoid an ICE with clang:
-    # https://bugs.chromium.org/p/webp/issues/detail?id=350
-    # WASM overrides the native SIMD so building it in is harmless aside from
-    # binary size.
-    if(NOT WEBP_ENABLE_WASM AND SIMD_DISABLE_FLAGS)
+    # Explicitly disable SIMD.
+    if(SIMD_DISABLE_FLAGS)
      list(GET SIMD_DISABLE_FLAGS ${I_SIMD} SIMD_COMPILE_FLAG)
      include(CheckCCompilerFlag)
      if(SIMD_COMPILE_FLAG)
@ -104,11 +106,12 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
            set(COMMON_PATTERNS)
          endif()
          set(CMAKE_REQUIRED_DEFINITIONS ${SIMD_COMPILE_FLAG})
-          check_c_source_compiles("int main(void) {return 0;}" FLAG2
+          check_c_source_compiles("int main(void) {return 0;}"
+            FLAG_${SIMD_COMPILE_FLAG}
            FAIL_REGEX "warning: argument unused during compilation:"
            ${COMMON_PATTERNS}
          )
-          if(NOT FLAG2)
+          if(NOT FLAG_${SIMD_COMPILE_FLAG})
            unset(HAS_COMPILE_FLAG CACHE)
          endif()
        endif()
@ -118,14 +121,5 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
      endif()
    endif()
  endif()
+  cmake_pop_check_state()
 endforeach()
-
-## Add *_wasm.c files if enabled.
-if(WEBP_ENABLE_WASM)
-  file(GLOB SIMD_FILES "${CMAKE_CURRENT_LIST_DIR}/../"
-    "src/dsp/*_wasm.c"
-  )
-  foreach(FILE ${SIMD_FILES})
-    list(APPEND WEBP_SIMD_FILES_TO_INCLUDE ${FILE})
-  endforeach()
-endif()
--- a/cmake/config.h.cmake
+++ b/cmake/config.h.cmake
@ -70,18 +70,43 @@ foreach(I_LIB PNG JPEG TIFF)
  set(WEBP_HAVE_${I_LIB} ${${I_LIB}_FOUND})
  if(${I_LIB}_FOUND)
    list(APPEND WEBP_DEP_IMG_LIBRARIES ${${I_LIB}_LIBRARIES})
-    list(APPEND WEBP_DEP_IMG_INCLUDE_DIRS ${${I_LIB}_INCLUDE_DIRS})
+    list(APPEND WEBP_DEP_IMG_INCLUDE_DIRS
+         ${${I_LIB}_INCLUDE_DIR} ${${I_LIB}_INCLUDE_DIRS})
  endif()
 endforeach()
+if(WEBP_DEP_IMG_INCLUDE_DIRS)
+  list(REMOVE_DUPLICATES WEBP_DEP_IMG_INCLUDE_DIRS)
+endif()

 # GIF detection, gifdec isn't part of the imageio lib.
+include(CMakePushCheckState)
 set(WEBP_DEP_GIF_LIBRARIES)
 set(WEBP_DEP_GIF_INCLUDE_DIRS)
 find_package(GIF)
 set(WEBP_HAVE_GIF ${GIF_FOUND})
 if(GIF_FOUND)
-  list(APPEND WEBP_DEP_GIF_LIBRARIES ${GIF_LIBRARIES})
-  list(APPEND WEBP_DEP_GIF_INCLUDE_DIRS ${GIF_INCLUDE_DIR})
+  # GIF find_package only locates the header and library, it doesn't fail
+  # compile tests when detecting the version, but falls back to 3 (as of at
+  # least cmake 3.7.2). Make sure the library links to avoid incorrect
+  # detection when cross compiling.
+  cmake_push_check_state()
+  set(CMAKE_REQUIRED_LIBRARIES ${GIF_LIBRARIES})
+  set(CMAKE_REQUIRED_INCLUDES ${GIF_INCLUDE_DIR})
+  check_c_source_compiles("
+      #include <gif_lib.h>
+      int main(void) {
+        (void)DGifOpenFileHandle;
+        return 0;
+      }
+      " GIF_COMPILES
+  )
+  cmake_pop_check_state()
+  if(GIF_COMPILES)
+    list(APPEND WEBP_DEP_GIF_LIBRARIES ${GIF_LIBRARIES})
+    list(APPEND WEBP_DEP_GIF_INCLUDE_DIRS ${GIF_INCLUDE_DIR})
+  else()
+    unset(GIF_FOUND)
+  endif()
 endif()

 ## Check for specific headers.
@ -139,13 +164,3 @@ strip_bracket(PACKAGE_URL)
 set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
 set(PACKAGE_TARNAME ${PACKAGE_NAME})
 set(VERSION ${PACKAGE_VERSION})
-
-## Generate the config.h header.
-configure_file(${CMAKE_CURRENT_LIST_DIR}/config.h.in
-  ${CMAKE_CURRENT_BINARY_DIR}/include/webp/config.h)
-add_definitions(-DHAVE_CONFIG_H)
-# The webp folder is included as we reference config.h as
-# ../webp/config.h or webp/config.h
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/include
-  ${CMAKE_CURRENT_BINARY_DIR}/include/webp
-)
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([libwebp], [0.6.0],
+AC_INIT([libwebp], [0.6.1],
        [https://bugs.chromium.org/p/webp],,
        [http://developers.google.com/speed/webp])
 AC_CANONICAL_HOST
@ -79,6 +79,7 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wold-style-definition])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wparentheses-equality])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshorten-64-to-32])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wundef])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunreachable-code])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused-but-set-variable])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused])
@ -444,12 +445,12 @@ AS_IF([test "x$enable_sdl" != "xno"], [
  CLEAR_LIBVARS([SDL])
  WITHLIB_OPTION([sdl], [SDL])

-  $sdl_header = "no";
+  sdl_header="no"
  LIBCHECK_PROLOGUE([SDL])
  AC_CHECK_HEADER([SDL/SDL.h], [sdl_header="SDL_SDL.h"],
                  [AC_CHECK_HEADER([SDL.h], [sdl_header="SDL.h"],
                  [AC_MSG_WARN(SDL library not available - no sdl.h)])])
-  if test x"$sdl_header" != "xno" ; then
+  if test x"$sdl_header" != "xno"; then
    AC_CHECK_LIB(SDL, SDL_Init,
                 [SDL_LIBS="-lSDL"
                  SDL_INCLUDES="-DWEBP_HAVE_SDL"
@ -458,14 +459,14 @@ AS_IF([test "x$enable_sdl" != "xno"], [
                  sdl_support=yes
                 ],
                 AC_MSG_WARN(Optional SDL library not found),
-                 [$MATH_LIBS]),
-    if test x"$sdl_header" == "xSDL.h" ; then
+                 [$MATH_LIBS])
+    if test x"$sdl_header" = "xSDL.h"; then
      SDL_INCLUDES="$SDL_INCLUDES -DWEBP_HAVE_JUST_SDL_H"
    fi
  fi
  LIBCHECK_EPILOGUE([SDL])

-  if test "$sdl_support" = "yes" ; then
+  if test "$sdl_support" = "yes"; then
    build_vwebp_sdl=yes
  fi
 ])
@ -589,7 +590,7 @@ AS_IF([test "x$enable_gif" != "xno"], [

  if test "$gif_support" = "yes" -a \
          "$enable_libwebpdemux" = "yes"; then
-    build_animdiff=yes
+    build_anim_diff=yes
  fi

  if test "$gif_support" = "yes" -a \
@ -597,7 +598,7 @@ AS_IF([test "x$enable_gif" != "xno"], [
    build_gif2webp=yes
  fi
 ])
-AM_CONDITIONAL([BUILD_ANIMDIFF], [test "${build_animdiff}" = "yes"])
+AM_CONDITIONAL([BUILD_ANIMDIFF], [test "${build_anim_diff}" = "yes"])
 AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])

 if test "$enable_libwebpmux" = "yes"; then
@ -662,7 +663,7 @@ if test "$enable_wic" = "yes"; then
 fi
 esac

-dnl === If --enable-swap-16bit-csp is defined, add -DWEBP_SWAP_16BIT_CSP
+dnl === If --enable-swap-16bit-csp is defined, add -DWEBP_SWAP_16BIT_CSP=1

 USE_SWAP_16BIT_CSP=""
 AC_MSG_CHECKING(if --enable-swap-16bit-csp option is specified)
@ -670,7 +671,7 @@ AC_ARG_ENABLE([swap-16bit-csp],
              AS_HELP_STRING([--enable-swap-16bit-csp],
                             [Enable byte swap for 16 bit colorspaces]))
 if test "$enable_swap_16bit_csp" = "yes"; then
-  USE_SWAP_16BIT_CSP="-DWEBP_SWAP_16BIT_CSP"
+  USE_SWAP_16BIT_CSP="-DWEBP_SWAP_16BIT_CSP=1"
 fi
 AC_MSG_RESULT(${enable_swap_16bit_csp-no})
 AC_SUBST(USE_SWAP_16BIT_CSP)
@ -688,6 +689,21 @@ fi
 AC_MSG_RESULT(${enable_experimental-no})
 AC_SUBST(USE_EXPERIMENTAL_CODE)

+dnl === If --disable-near-lossless is defined, add -DWEBP_NEAR_LOSSLESS=0
+
+AC_DEFINE(WEBP_NEAR_LOSSLESS, [1], [Enable near lossless encoding])
+AC_MSG_CHECKING(if --disable-near-lossless option is specified)
+AC_ARG_ENABLE([near_lossless],
+              AS_HELP_STRING([--disable-near-lossless],
+                             [Disable near lossless encoding]),
+              [], [enable_near_lossless=yes])
+if test "$enable_near_lossless" = "no"; then
+  AC_DEFINE(WEBP_NEAR_LOSSLESS, [0], [Enable near lossless encoding])
+  AC_MSG_RESULT([yes])
+else
+  AC_MSG_RESULT([no])
+fi
+
 dnl === Check whether libwebpmux should be built
 AC_MSG_CHECKING(whether libwebpmux is to be built)
 AC_ARG_ENABLE([libwebpmux],
@ -762,7 +778,7 @@ dwebp : yes
  PNG  : ${png_support-no}
  WIC  : ${wic_support-no}
 GIF support : ${gif_support-no}
-anim_diff   : ${build_animdiff-no}
+anim_diff   : ${build_anim_diff-no}
 gif2webp    : ${build_gif2webp-no}
 img2webp    : ${build_img2webp-no}
 webpmux     : ${enable_libwebpmux-no}
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@ -2,7 +2,7 @@ AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src

 bin_PROGRAMS = dwebp cwebp
 if BUILD_ANIMDIFF
-  noinst_PROGRAMS = anim_diff
+  noinst_PROGRAMS = anim_diff anim_dump
 endif
 if BUILD_GIF2WEBP
  bin_PROGRAMS += gif2webp
@ -27,20 +27,36 @@ libexample_util_la_LIBADD = ../src/libwebp.la

 anim_diff_SOURCES = anim_diff.c anim_util.c anim_util.h
 anim_diff_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
-anim_diff_LDADD  = ../src/demux/libwebpdemux.la
-anim_diff_LDADD += libexample_util.la ../imageio/libimageio_util.la
+anim_diff_LDADD  =
+anim_diff_LDADD += ../src/demux/libwebpdemux.la
+anim_diff_LDADD += libexample_util.la
+anim_diff_LDADD += ../imageio/libimageio_util.la
 anim_diff_LDADD += $(GIF_LIBS) -lm

+anim_dump_SOURCES = anim_dump.c anim_util.c anim_util.h
+anim_dump_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(PNG_INCLUDES)
+anim_dump_CPPFLAGS += $(GIF_INCLUDES)
+anim_dump_LDADD  =
+anim_dump_LDADD += ../src/demux/libwebpdemux.la
+anim_dump_LDADD += libexample_util.la
+anim_dump_LDADD += ../imageio/libimageio_util.la
+anim_dump_LDADD += ../imageio/libimageenc.la
+anim_dump_LDADD += $(PNG_LIBS) $(GIF_LIBS) $(TIFF_LIBS) -lm
+
 cwebp_SOURCES  = cwebp.c stopwatch.h
 cwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-cwebp_LDADD  = libexample_util.la ../imageio/libimageio_util.la
-cwebp_LDADD += ../imageio/libimagedec.la ../src/libwebp.la
+cwebp_LDADD  =
+cwebp_LDADD += libexample_util.la
+cwebp_LDADD += ../imageio/libimageio_util.la
+cwebp_LDADD += ../imageio/libimagedec.la
+cwebp_LDADD += ../src/libwebp.la
 cwebp_LDADD += $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)

 dwebp_SOURCES = dwebp.c stopwatch.h
 dwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 dwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES)
-dwebp_LDADD  = libexample_util.la
+dwebp_LDADD  =
+dwebp_LDADD += libexample_util.la
 dwebp_LDADD += ../imageio/libimagedec.la
 dwebp_LDADD += ../imageio/libimageenc.la
 dwebp_LDADD += ../imageio/libimageio_util.la
@ -49,35 +65,52 @@ dwebp_LDADD +=$(PNG_LIBS) $(JPEG_LIBS)

 gif2webp_SOURCES = gif2webp.c gifdec.c gifdec.h
 gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
-gif2webp_LDADD  = libexample_util.la ../imageio/libimageio_util.la
-gif2webp_LDADD += ../src/mux/libwebpmux.la ../src/libwebp.la $(GIF_LIBS)
+gif2webp_LDADD  =
+gif2webp_LDADD += libexample_util.la
+gif2webp_LDADD += ../imageio/libimageio_util.la
+gif2webp_LDADD += ../src/mux/libwebpmux.la
+gif2webp_LDADD += ../src/libwebp.la
+gif2webp_LDADD += $(GIF_LIBS)

 vwebp_SOURCES = vwebp.c
 vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GL_INCLUDES)
-vwebp_LDADD  = libexample_util.la ../imageio/libimageio_util.la
-vwebp_LDADD += ../src/demux/libwebpdemux.la $(GL_LIBS)
+vwebp_LDADD  =
+vwebp_LDADD += libexample_util.la
+vwebp_LDADD += ../imageio/libimageio_util.la
+vwebp_LDADD += ../src/demux/libwebpdemux.la
+vwebp_LDADD += $(GL_LIBS)

 webpmux_SOURCES = webpmux.c
 webpmux_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-webpmux_LDADD  = libexample_util.la ../imageio/libimageio_util.la
-webpmux_LDADD += ../src/mux/libwebpmux.la ../src/libwebp.la
+webpmux_LDADD  =
+webpmux_LDADD += libexample_util.la
+webpmux_LDADD += ../imageio/libimageio_util.la
+webpmux_LDADD += ../src/mux/libwebpmux.la
+webpmux_LDADD += ../src/libwebp.la

 img2webp_SOURCES = img2webp.c
 img2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-img2webp_LDADD  = libexample_util.la ../imageio/libimageio_util.la
+img2webp_LDADD  =
+img2webp_LDADD += libexample_util.la
+img2webp_LDADD += ../imageio/libimageio_util.la
 img2webp_LDADD += ../imageio/libimagedec.la
-img2webp_LDADD += ../src/mux/libwebpmux.la ../src/libwebp.la
+img2webp_LDADD += ../src/mux/libwebpmux.la
+img2webp_LDADD += ../src/libwebp.la
 img2webp_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)

 webpinfo_SOURCES = webpinfo.c
 webpinfo_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-webpinfo_LDADD  = libexample_util.la ../imageio/libimageio_util.la
+webpinfo_LDADD  =
+webpinfo_LDADD += libexample_util.la
+webpinfo_LDADD += ../imageio/libimageio_util.la
 webpinfo_LDADD += ../src/libwebp.la

 if BUILD_LIBWEBPDECODER
  anim_diff_LDADD += ../src/libwebpdecoder.la
+  anim_dump_LDADD += ../src/libwebpdecoder.la
  vwebp_LDADD += ../src/libwebpdecoder.la
 else
  anim_diff_LDADD += ../src/libwebp.la
+  anim_dump_LDADD += ../src/libwebp.la
  vwebp_LDADD += ../src/libwebp.la
 endif
--- a/examples/anim_dump.c
+++ b/examples/anim_dump.c
@ -0,0 +1,104 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Decodes an animated WebP file and dumps its frames as PNG, TIFF or PAM.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdio.h>
+#include <string.h>  // for 'strcmp'.
+
+#include "./anim_util.h"
+#include "webp/decode.h"
+#include "../imageio/image_enc.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+static void Help(void) {  // Prints the command-line usage text to stdout.
+  printf("Usage: anim_dump [options] files...\n");
+  printf("\nOptions:\n");
+  printf("  -folder <string> .... dump folder (default: '.')\n");
+  printf("  -prefix <string> .... prefix for dumped frames "
+                                  "(default: 'dump_')\n");
+  printf("  -tiff ............... save frames as TIFF\n");
+  printf("  -pam ................ save frames as PAM\n");
+}
+
+int main(int argc, const char* argv[]) {  // 0: ok, 1: error, -1: no args.
+  int error = 0;  // Set to 1 on any failure; stops both loops below.
+  const char* dump_folder = ".";  // Directory part of each output path.
+  const char* prefix = "dump_";  // File-name prefix of each dumped frame.
+  const char* suffix = "png";  // File extension, kept in sync with 'format'.
+  WebPOutputFileFormat format = PNG;
+  int c;
+
+  if (argc < 2) {
+    Help();
+    return -1;
+  }
+
+  for (c = 1; !error && c < argc; ++c) {  // Options affect later files only.
+    if (!strcmp(argv[c], "-folder")) {
+      if (c + 1 == argc) {
+        fprintf(stderr, "missing argument after option '%s'\n", argv[c]);
+        error = 1;
+        break;
+      }
+      dump_folder = argv[++c];  // Consume the option's value.
+    } else if (!strcmp(argv[c], "-prefix")) {
+      if (c + 1 == argc) {
+        fprintf(stderr, "missing argument after option '%s'\n", argv[c]);
+        error = 1;
+        break;
+      }
+      prefix = argv[++c];  // Consume the option's value.
+    } else if (!strcmp(argv[c], "-tiff")) {
+      format = TIFF;
+      suffix = "tiff";
+    } else if (!strcmp(argv[c], "-pam")) {
+      format = PAM;
+      suffix = "pam";
+    } else {  // Any other argument is treated as an input file.
+      uint32_t i;
+      AnimatedImage image;
+      const char* const file = argv[c];
+      memset(&image, 0, sizeof(image));
+      printf("Decoding file: %s as %s/%sxxxx.%s\n",
+             file, dump_folder, prefix, suffix);
+      if (!ReadAnimatedImage(file, &image, 0, NULL)) {
+        fprintf(stderr, "Error decoding file: %s\n Aborting.\n", file);
+        error = 1;
+        break;
+      }
+      for (i = 0; !error && i < image.num_frames; ++i) {  // One file per frame.
+        char out_file[1024];
+        WebPDecBuffer buffer;
+        WebPInitDecBuffer(&buffer);
+        buffer.colorspace = MODE_RGBA;
+        buffer.is_external_memory = 1;  // Pixels come from 'image' (below).
+        buffer.width = image.canvas_width;
+        buffer.height = image.canvas_height;
+        buffer.u.RGBA.rgba = image.frames[i].rgba;
+        buffer.u.RGBA.stride = buffer.width * sizeof(uint32_t);
+        buffer.u.RGBA.size = buffer.u.RGBA.stride * buffer.height;
+        snprintf(out_file, sizeof(out_file), "%s/%s%.4d.%s",
+                 dump_folder, prefix, i, suffix);  // 'i' is the frame index.
+        if (!WebPSaveImage(&buffer, format, out_file)) {
+          fprintf(stderr, "Error while saving image '%s'\n", out_file);
+          error = 1;
+        }
+        WebPFreeDecBuffer(&buffer);
+      }
+      ClearAnimatedImage(&image);
+    }
+  }
+  return error ? 1 : 0;
+}
--- a/examples/anim_util.c
+++ b/examples/anim_util.c
@ -16,7 +16,7 @@
 #include <stdio.h>
 #include <string.h>

-#ifdef WEBP_HAVE_GIF
+#if defined(WEBP_HAVE_GIF)
 #include <gif_lib.h>
 #endif
 #include "webp/format_constants.h"
@ -33,11 +33,13 @@ static const int kNumChannels = 4;
 // -----------------------------------------------------------------------------
 // Common utilities.

+#if defined(WEBP_HAVE_GIF)
 // Returns true if the frame covers the full canvas.
 static int IsFullFrame(int width, int height,
                       int canvas_width, int canvas_height) {
  return (width == canvas_width && height == canvas_height);
 }
+#endif // WEBP_HAVE_GIF

 static int CheckSizeForOverflow(uint64_t size) {
  return (size == (size_t)size);
@ -85,6 +87,7 @@ void ClearAnimatedImage(AnimatedImage* const image) {
  }
 }

+#if defined(WEBP_HAVE_GIF)
 // Clear the canvas to transparent.
 static void ZeroFillCanvas(uint8_t* rgba,
                           uint32_t canvas_width, uint32_t canvas_height) {
@ -126,6 +129,7 @@ static void CopyFrameRectangle(const uint8_t* src, uint8_t* dst, int stride,
    dst += stride;
  }
 }
+#endif // WEBP_HAVE_GIF

 // Canonicalize all transparent pixels to transparent black to aid comparison.
 static void CleanupTransparentPixels(uint32_t* rgba,
@ -152,6 +156,8 @@ static int DumpFrame(const char filename[], const char dump_folder[],
  FILE* f = NULL;
  const char* row;

+  if (dump_folder == NULL) dump_folder = ".";
+
  base_name = strrchr(filename, '/');
  base_name = (base_name == NULL) ? filename : base_name + 1;
  max_len = strlen(dump_folder) + 1 + strlen(base_name)
@ -200,7 +206,7 @@ static int IsWebP(const WebPData* const webp_data) {
  return (WebPGetInfo(webp_data->bytes, webp_data->size, NULL, NULL) != 0);
 }

-// Read animated WebP bitstream 'file_str' into 'AnimatedImage' struct.
+// Read animated WebP bitstream 'webp_data' into 'AnimatedImage' struct.
 static int ReadAnimatedWebP(const char filename[],
                            const WebPData* const webp_data,
                            AnimatedImage* const image, int dump_frames,
@ -278,7 +284,7 @@ static int ReadAnimatedWebP(const char filename[],
 // -----------------------------------------------------------------------------
 // GIF Decoding.

-#ifdef WEBP_HAVE_GIF
+#if defined(WEBP_HAVE_GIF)

 // Returns true if this is a valid GIF bitstream.
 static int IsGIF(const WebPData* const data) {
@ -423,6 +429,11 @@ static uint32_t GetBackgroundColorGIF(GifFileType* gif) {
 }

 // Find appropriate app extension and get loop count from the next extension.
+// We use Chrome's interpretation of the 'loop_count' semantics:
+//   if not present -> loop once
+//   if present and loop_count == 0, return 0 ('infinite').
+//   if present and loop_count != 0, it's the number of *extra* loops
+//     so we need to return loop_count + 1 as total loop number.
 static uint32_t GetLoopCountGIF(const GifFileType* const gif) {
  int i;
  for (i = 0; i < gif->ImageCount; ++i) {
@ -440,12 +451,13 @@ static uint32_t GetLoopCountGIF(const GifFileType* const gif) {
      if (signature_is_ok &&
          eb2->Function == CONTINUE_EXT_FUNC_CODE && eb2->ByteCount >= 3 &&
          eb2->Bytes[0] == 1) {
-        return ((uint32_t)(eb2->Bytes[2]) << 8) +
-               ((uint32_t)(eb2->Bytes[1]) << 0);
+        const uint32_t extra_loop = ((uint32_t)(eb2->Bytes[2]) << 8) +
+                                    ((uint32_t)(eb2->Bytes[1]) << 0);
+        return (extra_loop > 0) ? extra_loop + 1 : 0;
      }
    }
  }
-  return 0;  // Default.
+  return 1;  // Default.
 }

 // Get duration of 'n'th frame in milliseconds.
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@ -463,8 +463,9 @@ static int WriteWebPWithMetadata(FILE* const out,
    } else {
      const int is_lossless = !memcmp(webp, "VP8L", kTagSize);
      if (is_lossless) {
-        // Presence of alpha is stored in the 29th bit of VP8L data.
-        if (webp[kChunkHeaderSize + 3] & (1 << 5)) flags |= kAlphaFlag;
+        // Presence of alpha is stored in the 37th bit (29th after the
+        // signature) of VP8L data.
+        if (webp[kChunkHeaderSize + 4] & (1 << 4)) flags |= kAlphaFlag;
      }
      ok = ok && (fwrite(kVP8XHeader, kChunkHeaderSize, 1, out) == 1);
      ok = ok && WriteLE32(out, flags);
@ -486,10 +487,10 @@ static int WriteWebPWithMetadata(FILE* const out,
      *metadata_written |= METADATA_XMP;
    }
    return ok;
-  } else {
-    // No metadata, just write the original image file.
-    return (fwrite(webp, webp_size, 1, out) == 1);
  }
+
+  // No metadata, just write the original image file.
+  return (fwrite(webp, webp_size, 1, out) == 1);
 }

 //------------------------------------------------------------------------------
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -72,8 +72,10 @@ static void Help(void) {
  printf("  -metadata <string> ..... comma separated list of metadata to\n");
  printf("                           ");
  printf("copy from the input to the output if present\n");
-  printf("                           "
-         "Valid values: all, none, icc, xmp (default)\n");
+  printf("                           ");
+  printf("Valid values: all, none, icc, xmp (default)\n");
+  printf("  -loop_compatibility .... use compatibility mode for Chrome\n");
+  printf("                           version prior to M62 (inclusive)\n");
  printf("  -mt .................... use multi-threading if available\n");
  printf("\n");
  printf("  -version ............... print version number and exit\n");
@ -104,7 +106,7 @@ int main(int argc, const char *argv[]) {
  WebPAnimEncoderOptions enc_options;
  WebPConfig config;

-  int is_first_frame = 1;     // Whether we are processing the first frame.
+  int frame_number = 0;     // Number of frames added so far.
  int done;
  int c;
  int quiet = 0;
@ -115,8 +117,9 @@ int main(int argc, const char *argv[]) {
  int stored_icc = 0;         // Whether we have already stored an ICC profile.
  WebPData xmp_data;
  int stored_xmp = 0;         // Whether we have already stored an XMP profile.
-  int loop_count = 0;
+  int loop_count = 0;         // default: infinite
  int stored_loop_count = 0;  // Whether we have found an explicit loop count.
+  int loop_compatibility = 0;
  WebPMux* mux = NULL;

  int default_kmin = 1;  // Whether to use default kmin value.
@ -151,6 +154,8 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-mixed")) {
      enc_options.allow_mixed = 1;
      config.lossless = 0;
+    } else if (!strcmp(argv[c], "-loop_compatibility")) {
+      loop_compatibility = 1;
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
@ -277,7 +282,7 @@ int main(int argc, const char *argv[]) {

        if (!DGifGetImageDesc(gif)) goto End;

-        if (is_first_frame) {
+        if (frame_number == 0) {
          if (verbose) {
            printf("Canvas screen: %d x %d\n", gif->SWidth, gif->SHeight);
          }
@ -319,7 +324,6 @@ int main(int argc, const char *argv[]) {
                    "a memory error.\n");
            goto End;
          }
-          is_first_frame = 0;
        }

        // Some even more broken GIF can have sub-rect with zero width/height.
@ -336,7 +340,11 @@ int main(int argc, const char *argv[]) {
        GIFBlendFrames(&frame, &gif_rect, &curr_canvas);

        if (!WebPAnimEncoderAdd(enc, &curr_canvas, frame_timestamp, &config)) {
-          fprintf(stderr, "%s\n", WebPAnimEncoderGetError(enc));
+          fprintf(stderr, "Error while adding frame #%d: %s\n", frame_number,
+                  WebPAnimEncoderGetError(enc));
+          goto End;
+        } else {
+          ++frame_number;
        }

        // Update canvases.
@ -386,7 +394,7 @@ int main(int argc, const char *argv[]) {
              if (verbose) {
                fprintf(stderr, "Loop count: %d\n", loop_count);
              }
-              stored_loop_count = (loop_count != 0);
+              stored_loop_count = loop_compatibility ? (loop_count != 0) : 1;
            } else {  // An extension containing metadata.
              // We only store the first encountered chunk of each type, and
              // only if requested by the user.
@ -443,6 +451,23 @@ int main(int argc, const char *argv[]) {
    goto End;
  }

+  if (!loop_compatibility) {
+    if (!stored_loop_count) {
+      // if no loop-count element is seen, the default is '1' (loop-once)
+      // and we need to signal it explicitly in WebP. Note however that
+      // in case there's a single frame, we still don't need to store it.
+      if (frame_number > 1) {
+        stored_loop_count = 1;
+        loop_count = 1;
+      }
+    } else if (loop_count > 0) {
+      // adapt GIF's semantic to WebP's (except in the infinite-loop case)
+      loop_count += 1;
+    }
+  }
+  // loop_count of 0 is the default (infinite), so no need to signal it
+  if (loop_count == 0) stored_loop_count = 0;
+
  if (stored_loop_count || stored_icc || stored_xmp) {
    // Re-mux to add loop count and/or metadata as needed.
    mux = WebPMuxCreate(&webp_data, 1);
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -248,9 +248,9 @@ static void HandleKey(unsigned char key, int pos_x, int pos_y) {
      }
    }
  } else if (key == 'i') {
+    // Note: this doesn't refresh an animation's last frame (doing so is quite
+    // a bit more involved, since the previous frame would need to be saved).
    kParams.print_info = 1 - kParams.print_info;
-    // TODO(skal): handle refresh of animation's last-frame too. It's quite
-    // more involved though (need to save the previous frame).
    if (!kParams.has_animation) ClearPreviousFrame();
    glutPostRedisplay();
  } else if (key == 'd') {
@ -260,8 +260,8 @@ static void HandleKey(unsigned char key, int pos_x, int pos_y) {
 }

 static void HandleReshape(int width, int height) {
-  // TODO(skal): should we preserve aspect ratio?
-  // Also: handle larger-than-screen pictures correctly.
+  // Note: reshape doesn't preserve aspect ratio, and might
+  // be handling larger-than-screen pictures incorrectly.
  glViewport(0, 0, width, height);
  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
@ -378,13 +378,23 @@ static void HandleDisplay(void) {
    }
  }
  glPopMatrix();
+#if defined(__APPLE__) || defined(_WIN32)
+  glFlush();
+#else
  glutSwapBuffers();
+#endif
 }

 static void StartDisplay(void) {
  const int width = kParams.canvas_width;
  const int height = kParams.canvas_height;
+  // TODO(webp:365) GLUT_DOUBLE causes flickering / old frames being
+  // partially displayed with animated webp + alpha.
+#if defined(__APPLE__) || defined(_WIN32)
+  glutInitDisplayMode(GLUT_RGBA);
+#else
  glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
+#endif
  glutInitWindowSize(width, height);
  glutCreateWindow("WebP viewer");
  glutDisplayFunc(HandleDisplay);
--- a/examples/webpinfo.c
+++ b/examples/webpinfo.c
@ -233,20 +233,20 @@ static int GetSignedBits(const uint8_t* const data, size_t data_size, size_t nb,
  return 1;
 }

-#define GET_BITS(v, n)                               \
-  do {                                               \
-    if (!GetBits(data, data_size, n, &v, bit_pos)) { \
-      LOG_ERROR("Truncated lossy bitstream.");       \
-      return WEBP_INFO_TRUNCATED_DATA;               \
-    }                                                \
+#define GET_BITS(v, n)                                 \
+  do {                                                 \
+    if (!GetBits(data, data_size, n, &(v), bit_pos)) { \
+      LOG_ERROR("Truncated lossy bitstream.");         \
+      return WEBP_INFO_TRUNCATED_DATA;                 \
+    }                                                  \
  } while (0)

-#define GET_SIGNED_BITS(v, n)                              \
-  do {                                                     \
-    if (!GetSignedBits(data, data_size, n, &v, bit_pos)) { \
-      LOG_ERROR("Truncated lossy bitstream.");             \
-      return WEBP_INFO_TRUNCATED_DATA;                     \
-    }                                                      \
+#define GET_SIGNED_BITS(v, n)                                \
+  do {                                                       \
+    if (!GetSignedBits(data, data_size, n, &(v), bit_pos)) { \
+      LOG_ERROR("Truncated lossy bitstream.");               \
+      return WEBP_INFO_TRUNCATED_DATA;                       \
+    }                                                        \
  } while (0)

 static WebPInfoStatus ParseLossySegmentHeader(const WebPInfo* const webp_info,
@ -462,12 +462,12 @@ static int LLGetBits(const uint8_t* const data, size_t data_size, size_t nb,
  return 1;
 }

-#define LL_GET_BITS(v, n)                              \
-  do {                                                 \
-    if (!LLGetBits(data, data_size, n, &v, bit_pos)) { \
-      LOG_ERROR("Truncated lossless bitstream.");      \
-      return WEBP_INFO_TRUNCATED_DATA;                 \
-    }                                                  \
+#define LL_GET_BITS(v, n)                                \
+  do {                                                   \
+    if (!LLGetBits(data, data_size, n, &(v), bit_pos)) { \
+      LOG_ERROR("Truncated lossless bitstream.");        \
+      return WEBP_INFO_TRUNCATED_DATA;                   \
+    }                                                    \
  } while (0)

 static WebPInfoStatus ParseLosslessTransform(WebPInfo* const webp_info,
@ -817,9 +817,8 @@ static WebPInfoStatus ProcessImageChunk(const ChunkData* const chunk_data,
    if (webp_info->seen_image_subchunk_) {
      LOG_ERROR("Consecutive VP8/VP8L sub-chunks in an ANMF chunk.");
      return WEBP_INFO_PARSE_ERROR;
-    } else {
-      webp_info->seen_image_subchunk_ = 1;
    }
+    webp_info->seen_image_subchunk_ = 1;
  } else {
    if (webp_info->chunk_counts_[CHUNK_VP8] ||
        webp_info->chunk_counts_[CHUNK_VP8L]) {
@ -873,9 +872,9 @@ static WebPInfoStatus ProcessALPHChunk(const ChunkData* const chunk_data,
    if (webp_info->seen_alpha_subchunk_) {
      LOG_ERROR("Consecutive ALPH sub-chunks in an ANMF chunk.");
      return WEBP_INFO_PARSE_ERROR;
-    } else {
-      webp_info->seen_alpha_subchunk_ = 1;
    }
+    webp_info->seen_alpha_subchunk_ = 1;
+
    if (webp_info->seen_image_subchunk_) {
      LOG_ERROR("ALPHA sub-chunk detected after VP8 sub-chunk "
                "in an ANMF chunk.");
@ -1107,6 +1106,7 @@ static void HelpLong(void) {
         "Note: there could be multiple input files;\n"
         "      options must come before input files.\n"
         "Options:\n"
+         "  -version ........... Print version number and exit.\n"
         "  -quiet ............. Do not show chunk parsing information.\n"
         "  -diag .............. Show parsing error diagnosis.\n"
         "  -summary ........... Show chunk stats summary.\n"
@ -1140,6 +1140,11 @@ int main(int argc, const char* argv[]) {
      show_summary = 1;
    } else if (!strcmp(argv[c], "-bitstream_info")) {
      parse_bitstream = 1;
+    } else if (!strcmp(argv[c], "-version")) {
+      const int version = WebPGetDecoderVersion();
+      printf("WebP Decoder version: %d.%d.%d\n",
+             (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
+      return 0;
    } else {  // Assume the remaining are all input files.
      break;
    }
--- a/extras/Makefile.am
+++ b/extras/Makefile.am
@ -1,3 +1,4 @@
+AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
 AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src
 noinst_LTLIBRARIES = libwebpextras.la

@ -19,18 +20,22 @@ endif

 get_disto_SOURCES  = get_disto.c
 get_disto_CPPFLAGS = $(AM_CPPFLAGS)
-get_disto_LDADD = ../imageio/libimageio_util.la ../imageio/libimagedec.la
+get_disto_LDADD =
+get_disto_LDADD += ../imageio/libimageio_util.la
+get_disto_LDADD += ../imageio/libimagedec.la
 get_disto_LDADD += ../src/libwebp.la
 get_disto_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)

 webp_quality_SOURCES  = webp_quality.c
 webp_quality_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-webp_quality_LDADD  = ../imageio/libimageio_util.la
+webp_quality_LDADD =
+webp_quality_LDADD += ../imageio/libimageio_util.la
 webp_quality_LDADD += libwebpextras.la
 webp_quality_LDADD += ../src/libwebp.la

 vwebp_sdl_SOURCES  = vwebp_sdl.c webp_to_sdl.c webp_to_sdl.h
 vwebp_sdl_CPPFLAGS = $(AM_CPPFLAGS) $(SDL_INCLUDES)
-vwebp_sdl_LDADD = ../imageio/libimageio_util.la
+vwebp_sdl_LDADD =
+vwebp_sdl_LDADD += ../imageio/libimageio_util.la
 vwebp_sdl_LDADD += ../src/libwebp.la
 vwebp_sdl_LDADD += $(SDL_LIBS)
--- a/extras/extras.c
+++ b/extras/extras.c
@ -10,7 +10,7 @@
 //  Additional WebP utilities.
 //

-#include "./extras.h"
+#include "extras/extras.h"
 #include "webp/format_constants.h"

 #include <assert.h>
@ -18,7 +18,7 @@

 #define XTRA_MAJ_VERSION 0
 #define XTRA_MIN_VERSION 1
-#define XTRA_REV_VERSION 0
+#define XTRA_REV_VERSION 1

 //------------------------------------------------------------------------------

--- a/extras/extras.h
+++ b/extras/extras.h
@ -25,28 +25,28 @@ extern "C" {

 // Returns the version number of the extras library, packed in hexadecimal using
 // 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetExtrasVersion(void);
+WEBP_EXTERN int WebPGetExtrasVersion(void);

 //------------------------------------------------------------------------------
 // Ad-hoc colorspace importers.

 // Import luma sample (gray scale image) into 'picture'. The 'picture'
 // width and height must be set prior to calling this function.
-WEBP_EXTERN(int) WebPImportGray(const uint8_t* gray, WebPPicture* picture);
+WEBP_EXTERN int WebPImportGray(const uint8_t* gray, WebPPicture* picture);

 // Import rgb sample in RGB565 packed format into 'picture'. The 'picture'
 // width and height must be set prior to calling this function.
-WEBP_EXTERN(int) WebPImportRGB565(const uint8_t* rgb565, WebPPicture* pic);
+WEBP_EXTERN int WebPImportRGB565(const uint8_t* rgb565, WebPPicture* pic);

 // Import rgb sample in RGB4444 packed format into 'picture'. The 'picture'
 // width and height must be set prior to calling this function.
-WEBP_EXTERN(int) WebPImportRGB4444(const uint8_t* rgb4444, WebPPicture* pic);
+WEBP_EXTERN int WebPImportRGB4444(const uint8_t* rgb4444, WebPPicture* pic);

 // Import a color mapped image. The number of colors is less or equal to
 // MAX_PALETTE_SIZE. 'pic' must have been initialized. Its content, if any,
 // will be discarded. Returns 'false' in case of error, or if indexed[] contains
 // invalid indices.
-WEBP_EXTERN(int)
+WEBP_EXTERN int
 WebPImportColorMappedARGB(const uint8_t* indexed, int indexed_stride,
                          const uint32_t palette[], int palette_size,
                          WebPPicture* pic);
@ -59,7 +59,7 @@ WebPImportColorMappedARGB(const uint8_t* indexed, int indexed_stride,
 // Otherwise (lossy bitstream), the returned value is in the range [0..100].
 // Any error (invalid bitstream, animated WebP, incomplete header, etc.)
 // will return a value of -1.
-WEBP_EXTERN(int) VP8EstimateQuality(const uint8_t* const data, size_t size);
+WEBP_EXTERN int VP8EstimateQuality(const uint8_t* const data, size_t size);

 //------------------------------------------------------------------------------

--- a/extras/get_disto.c
+++ b/extras/get_disto.c
@ -24,8 +24,8 @@
 #include <string.h>

 #include "webp/encode.h"
-#include "../imageio/image_dec.h"
-#include "../imageio/imageio_util.h"
+#include "imageio/image_dec.h"
+#include "imageio/imageio_util.h"

 static size_t ReadPicture(const char* const filename, WebPPicture* const pic,
                          int keep_alpha) {
--- a/extras/quality_estimate.c
+++ b/extras/quality_estimate.c
@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./extras.h"
+#include "extras/extras.h"
 #include "webp/decode.h"

 #include <math.h>
--- a/extras/vwebp_sdl.c
+++ b/extras/vwebp_sdl.c
@ -24,7 +24,7 @@

 #include "webp_to_sdl.h"
 #include "webp/decode.h"
-#include "../imageio/imageio_util.h"
+#include "imageio/imageio_util.h"

 #if defined(WEBP_HAVE_JUST_SDL_H)
 #include <SDL.h>
--- a/extras/webp_quality.c
+++ b/extras/webp_quality.c
@ -11,8 +11,8 @@
 #include <stdlib.h>
 #include <string.h>

-#include "./extras.h"
-#include "../imageio/imageio_util.h"
+#include "extras/extras.h"
+#include "imageio/imageio_util.h"

 int main(int argc, const char *argv[]) {
  int c;
--- a/extras/webp_to_sdl.c
+++ b/extras/webp_to_sdl.c
@ -28,6 +28,7 @@
 #include <SDL/SDL.h>
 #endif

+static int init_ok = 0;
 int WebpToSDL(const char* data, unsigned int data_size) {
  int ok = 0;
  VP8StatusCode status;
@ -42,7 +43,10 @@ int WebpToSDL(const char* data, unsigned int data_size) {
    return 1;
  }

-  SDL_Init(SDL_INIT_VIDEO);
+  if (!init_ok) {
+    SDL_Init(SDL_INIT_VIDEO);
+    init_ok = 1;
+  }

  status = WebPGetFeatures((uint8_t*)data, (size_t)data_size, &config.input);
  if (status != VP8_STATUS_OK) goto Error;
@ -97,6 +101,7 @@ int WebpToSDL(const char* data, unsigned int data_size) {
 Error:
  SDL_FreeSurface(surface);
  SDL_FreeSurface(screen);
+  WebPFreeDecBuffer(output);
  return ok;
 }

--- a/imageio/Makefile.am
+++ b/imageio/Makefile.am
@ -1,13 +1,18 @@
 AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src
-noinst_LTLIBRARIES = libimageio_util.la libimagedec.la libimageenc.la
+noinst_LTLIBRARIES =
+noinst_LTLIBRARIES += libimageio_util.la
+noinst_LTLIBRARIES += libimagedec.la
+noinst_LTLIBRARIES += libimageenc.la

 noinst_HEADERS =
 noinst_HEADERS += ../src/webp/decode.h
 noinst_HEADERS += ../src/webp/types.h

-libimageio_util_la_SOURCES = imageio_util.c imageio_util.h
+libimageio_util_la_SOURCES =
+libimageio_util_la_SOURCES += imageio_util.c imageio_util.h

-libimagedec_la_SOURCES  = image_dec.c image_dec.h
+libimagedec_la_SOURCES  =
+libimagedec_la_SOURCES += image_dec.c image_dec.h
 libimagedec_la_SOURCES += jpegdec.c jpegdec.h
 libimagedec_la_SOURCES += metadata.c metadata.h
 libimagedec_la_SOURCES += pngdec.c pngdec.h
@ -18,6 +23,7 @@ libimagedec_la_SOURCES += wicdec.c wicdec.h
 libimagedec_la_CPPFLAGS = $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
 libimagedec_la_CPPFLAGS += $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)

-libimageenc_la_SOURCES  = image_enc.c image_enc.h
+libimageenc_la_SOURCES  =
+libimageenc_la_SOURCES += image_enc.c image_enc.h
 libimageenc_la_CPPFLAGS = $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
 libimageenc_la_CPPFLAGS += $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
--- a/imageio/image_enc.c
+++ b/imageio/image_enc.c
@ -542,22 +542,24 @@ int WebPWriteYUV(FILE* fout, const WebPDecBuffer* const buffer) {
 // Generic top-level call

 int WebPSaveImage(const WebPDecBuffer* const buffer,
-                  WebPOutputFileFormat format, const char* const out_file) {
+                  WebPOutputFileFormat format,
+                  const char* const out_file_name) {
  FILE* fout = NULL;
  int needs_open_file = 1;
-  const int use_stdout = (out_file != NULL) && !strcmp(out_file, "-");
+  const int use_stdout = (out_file_name != NULL) && !strcmp(out_file_name, "-");
  int ok = 1;

-  if (buffer == NULL || out_file == NULL) return 0;
+  if (buffer == NULL || out_file_name == NULL) return 0;

 #ifdef HAVE_WINCODEC_H
  needs_open_file = (format != PNG);
 #endif

  if (needs_open_file) {
-    fout = use_stdout ? ImgIoUtilSetBinaryMode(stdout) : fopen(out_file, "wb");
+    fout = use_stdout ? ImgIoUtilSetBinaryMode(stdout)
+                      : fopen(out_file_name, "wb");
    if (fout == NULL) {
-      fprintf(stderr, "Error opening output file %s\n", out_file);
+      fprintf(stderr, "Error opening output file %s\n", out_file_name);
      return 0;
    }
  }
@ -566,7 +568,7 @@ int WebPSaveImage(const WebPDecBuffer* const buffer,
      format == RGBA || format == BGRA || format == ARGB ||
      format == rgbA || format == bgrA || format == Argb) {
 #ifdef HAVE_WINCODEC_H
-    ok &= WebPWritePNG(out_file, use_stdout, buffer);
+    ok &= WebPWritePNG(out_file_name, use_stdout, buffer);
 #else
    ok &= WebPWritePNG(fout, buffer);
 #endif
--- a/imageio/imageio_util.c
+++ b/imageio/imageio_util.c
@ -137,7 +137,11 @@ void ImgIoUtilCopyPlane(const uint8_t* src, int src_stride,

 int ImgIoUtilCheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
  const uint64_t total_size = nmemb * size;
-  return (total_size == (size_t)total_size);
+  int ok = (total_size == (size_t)total_size);
+#if defined(WEBP_MAX_IMAGE_SIZE)
+  ok = ok && (total_size <= (uint64_t)WEBP_MAX_IMAGE_SIZE);
+#endif
+  return ok;
 }

 // -----------------------------------------------------------------------------
--- a/imageio/jpegdec.c
+++ b/imageio/jpegdec.c
@ -304,18 +304,18 @@ int ReadJPEG(const uint8_t* const data, size_t data_size,

  if (stride != (int)stride ||
      !ImgIoUtilCheckSizeArgumentsOverflow(stride, height)) {
-    goto End;
+    goto Error;
  }

  rgb = (uint8_t*)malloc((size_t)stride * height);
  if (rgb == NULL) {
-    goto End;
+    goto Error;
  }
  buffer[0] = (JSAMPLE*)rgb;

  while (dinfo.output_scanline < dinfo.output_height) {
    if (jpeg_read_scanlines((j_decompress_ptr)&dinfo, buffer, 1) != 1) {
-      goto End;
+      goto Error;
    }
    buffer[0] += stride;
  }
--- a/imageio/pnmdec.c
+++ b/imageio/pnmdec.c
@ -117,8 +117,13 @@ static size_t ReadPAMFields(PNMInfo* const info, size_t off) {
    }
  }
  if (!(info->seen_flags & TUPLE_FLAG)) {
-    info->seen_flags |= TUPLE_FLAG;
-    info->bytes_per_px = info->depth * (info->max_value > 255 ? 2 : 1);
+    if (info->depth > 0 && info->depth <= 4) {
+      info->seen_flags |= TUPLE_FLAG;
+      info->bytes_per_px = info->depth * (info->max_value > 255 ? 2 : 1);
+    } else {
+      fprintf(stderr, "PAM: invalid bitdepth (%d).\n", info->depth);
+      return 0;
+    }
  }
  if (info->seen_flags != ALL_NEEDED_FLAGS) {
    fprintf(stderr, "PAM: incomplete header.\n");
--- a/imageio/webpdec.c
+++ b/imageio/webpdec.c
@ -141,10 +141,21 @@ int ReadWebP(const uint8_t* const data, size_t data_size,

  do {
    const int has_alpha = keep_alpha && bitstream->has_alpha;
+    uint64_t stride;
    pic->width = bitstream->width;
    pic->height = bitstream->height;
-    if (!pic->use_argb) pic->colorspace = has_alpha ? WEBP_YUV420A
-                                                    : WEBP_YUV420;
+    if (pic->use_argb) {
+      stride = (uint64_t)bitstream->width * 4;
+    } else {
+      stride = (uint64_t)bitstream->width * (has_alpha ? 5 : 3) / 2;
+      pic->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
+    }
+
+    if (!ImgIoUtilCheckSizeArgumentsOverflow(stride, bitstream->height)) {
+      status = VP8_STATUS_OUT_OF_MEMORY;
+      break;
+    }
+
    ok = WebPPictureAlloc(pic);
    if (!ok) {
      status = VP8_STATUS_OUT_OF_MEMORY;
--- a/makefile.unix
+++ b/makefile.unix
@ -34,6 +34,16 @@ else
  GL_LIBS = -lglut -lGL
 endif

+# SDL flags: use sdl-config if it exists
+SDL_CONFIG = $(shell sdl-config --version 2> /dev/null)
+ifneq ($(SDL_CONFIG),)
+  SDL_LIBS = $(shell sdl-config --libs)
+  SDL_FLAGS = $(shell sdl-config --cflags)
+else
+  # use best-guess
+  SDL_LIBS = -lSDL
+  SDL_FLAGS =
+endif

 # To install libraries on Mac OS X:
 # 1. Install MacPorts (http://www.macports.org/install.php)
@ -57,7 +67,7 @@ endif
 # EXTRA_FLAGS += -DWEBP_EXPERIMENTAL_FEATURES

 # Extra flags to enable byte swap for 16 bit colorspaces.
-# EXTRA_FLAGS += -DWEBP_SWAP_16BIT_CSP
+# EXTRA_FLAGS += -DWEBP_SWAP_16BIT_CSP=1

 # Extra flags to enable multi-threading
 EXTRA_FLAGS += -DWEBP_USE_THREAD
@ -103,7 +113,7 @@ endif

 AR = ar
 ARFLAGS = r
-CPPFLAGS = -Isrc/ -Wall
+CPPFLAGS = -I. -Isrc/ -Wall
 CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
 CC = gcc
 INSTALL = install
@ -173,9 +183,6 @@ DSP_DEC_OBJS = \
    src/dsp/yuv_sse2.o \

 DSP_ENC_OBJS = \
-    src/dsp/argb.o \
-    src/dsp/argb_mips_dsp_r2.o \
-    src/dsp/argb_sse2.o \
    src/dsp/cost.o \
    src/dsp/cost_mips32.o \
    src/dsp/cost_mips_dsp_r2.o \
@ -335,7 +342,8 @@ OUT_LIBS += src/libwebp.a
 EXTRA_LIB = extras/libwebpextras.a
 OUT_EXAMPLES = examples/cwebp examples/dwebp
 EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux \
-                 examples/anim_diff examples/img2webp examples/webpinfo
+                 examples/anim_diff examples/anim_dump \
+                 examples/img2webp examples/webpinfo
 OTHER_EXAMPLES = extras/get_disto extras/webp_quality extras/vwebp_sdl

 OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
@ -363,7 +371,7 @@ src/utils/bit_reader_utils.o: src/utils/endian_inl_utils.h
 src/utils/bit_writer_utils.o: src/utils/endian_inl_utils.h

 %.o: %.c $(HDRS)
-	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@
+	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@

 examples/libanim_util.a: $(ANIM_UTIL_OBJS)
 examples/libexample_util.a: $(EX_UTIL_OBJS)
@ -381,6 +389,7 @@ src/demux/libwebpdemux.a: $(LIBWEBPDEMUX_OBJS)
 	$(AR) $(ARFLAGS) $@ $^

 examples/anim_diff: examples/anim_diff.o $(ANIM_UTIL_OBJS) $(GIFDEC_OBJS)
+examples/anim_dump: examples/anim_dump.o $(ANIM_UTIL_OBJS)
 examples/cwebp: examples/cwebp.o
 examples/dwebp: examples/dwebp.o
 examples/gif2webp: examples/gif2webp.o $(GIFDEC_OBJS)
@ -394,6 +403,13 @@ examples/anim_diff: src/demux/libwebpdemux.a examples/libexample_util.a
 examples/anim_diff: imageio/libimageio_util.a src/libwebp.a
 examples/anim_diff: EXTRA_LIBS += $(GIF_LIBS)
 examples/anim_diff: EXTRA_FLAGS += -DWEBP_HAVE_GIF
+examples/anim_dump: examples/libanim_util.a
+examples/anim_dump: src/demux/libwebpdemux.a
+examples/anim_dump: examples/libexample_util.a
+examples/anim_dump: imageio/libimageio_util.a
+examples/anim_dump: imageio/libimageenc.a
+examples/anim_dump: src/libwebp.a
+examples/anim_dump: EXTRA_LIBS += $(GIF_LIBS) $(DWEBP_LIBS)
 examples/cwebp: examples/libexample_util.a
 examples/cwebp: imageio/libimagedec.a
 examples/cwebp: imageio/libimageio_util.a
@ -434,8 +450,8 @@ extras/vwebp_sdl: extras/vwebp_sdl.o
 extras/vwebp_sdl: extras/webp_to_sdl.o
 extras/vwebp_sdl: imageio/libimageio_util.a
 extras/vwebp_sdl: src/libwebp.a
-extras/vwebp_sdl: EXTRA_FLAGS += -DWEBP_HAVE_SDL
-extras/vwebp_sdl: EXTRA_LIBS += -lSDL
+extras/vwebp_sdl: EXTRA_FLAGS += -DWEBP_HAVE_SDL $(SDL_FLAGS)
+extras/vwebp_sdl: EXTRA_LIBS += $(SDL_LIBS)

 $(OUT_EXAMPLES) $(EXTRA_EXAMPLES) $(OTHER_EXAMPLES):
 	$(CC) -o $@ $^ $(LDFLAGS)
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "January 25, 2017"
+.TH GIF2WEBP 1 "September 20, 2017"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@ -109,6 +109,10 @@ the range of 20 to 50.
 .TP
 .B \-mt
 Use multi-threading for encoding, if possible.
+.TP
+.B \-loop_compatibility
+If enabled, handle the loop information in a compatible fashion for Chrome
+versions prior to M62 (inclusive) and Firefox.
 .TP
 .B \-v
 Print extra information.
--- a/man/webpinfo.1
+++ b/man/webpinfo.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPINFO 1 "May 08, 2017"
+.TH WEBPINFO 1 "November 24, 2017"
 .SH NAME
 webpinfo \- print out the chunk level structure of WebP files
 along with basic integrity checks.
@ -22,16 +22,19 @@ WebP format.

 .SH OPTIONS
 .TP
-.B -quiet
+.B \-version
+Print the version number (as major.minor.revision) and exit.
+.TP
+.B \-quiet
 Do not show chunk parsing information.
 .TP
-.B -diag
+.B \-diag
 Show parsing error diagnosis.
 .TP
-.B -summary
+.B \-summary
 Show chunk stats summary.
 .TP
-.BI -bitstream_info
+.BI \-bitstream_info
 Parse bitstream header.
 .TP
 .B \-h, \-help
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -22,6 +22,7 @@ commondir = $(includedir)/webp
 libwebp_la_SOURCES =
 libwebpinclude_HEADERS =
 libwebpinclude_HEADERS += webp/encode.h
+
 noinst_HEADERS =
 noinst_HEADERS += webp/format_constants.h

@ -35,7 +36,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 7:0:0
+libwebp_la_LDFLAGS = -no-undefined -version-info 7:1:0
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

@ -47,7 +48,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la

-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 3:0:0
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 3:1:0
  pkgconfig_DATA += libwebpdecoder.pc
 endif

--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@ -1,3 +1,4 @@
+AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
 noinst_LTLIBRARIES = libwebpdecode.la

 libwebpdecode_la_SOURCES =
--- a/src/dec/alpha_dec.c
+++ b/src/dec/alpha_dec.c
@ -12,13 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "./alphai_dec.h"
-#include "./vp8i_dec.h"
-#include "./vp8li_dec.h"
-#include "../dsp/dsp.h"
-#include "../utils/quant_levels_dec_utils.h"
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/quant_levels_dec_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"

 //------------------------------------------------------------------------------
 // ALPHDecoder object.
--- a/src/dec/alphai_dec.h
+++ b/src/dec/alphai_dec.h
@ -11,11 +11,11 @@
 //
 // Author: Urvang (urvang@google.com)

-#ifndef WEBP_DEC_ALPHAI_H_
-#define WEBP_DEC_ALPHAI_H_
+#ifndef WEBP_DEC_ALPHAI_DEC_H_
+#define WEBP_DEC_ALPHAI_DEC_H_

-#include "./webpi_dec.h"
-#include "../utils/filters_utils.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/filters_utils.h"

 #ifdef __cplusplus
 extern "C" {
@ -51,4 +51,4 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec);
 }    // extern "C"
 #endif

-#endif  /* WEBP_DEC_ALPHAI_H_ */
+#endif  /* WEBP_DEC_ALPHAI_DEC_H_ */
--- a/src/dec/buffer_dec.c
+++ b/src/dec/buffer_dec.c
@ -13,15 +13,15 @@

 #include <stdlib.h>

-#include "./vp8i_dec.h"
-#include "./webpi_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/utils.h"

 //------------------------------------------------------------------------------
 // WebPDecBuffer

 // Number of bytes per pixel for the different color-spaces.
-static const int kModeBpp[MODE_LAST] = {
+static const uint8_t kModeBpp[MODE_LAST] = {
  3, 4, 3, 4, 4, 2, 2,
  4, 4, 4, 2,    // pre-multiplied modes
  1, 1 };
@ -36,7 +36,7 @@ static int IsValidColorspace(int webp_csp_mode) {
 // strictly speaking, the very last (or first, if flipped) row
 // doesn't require padding.
 #define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE)       \
-    (uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH)
+    ((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))

 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  int ok = 1;
@ -98,9 +98,14 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
    uint64_t uv_size = 0, a_size = 0, total_size;
    // We need memory and it hasn't been allocated yet.
    // => initialize output buffer, now that dimensions are known.
-    const int stride = w * kModeBpp[mode];
-    const uint64_t size = (uint64_t)stride * h;
+    int stride;
+    uint64_t size;

+    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 31)) {  // stride is an int
+      return VP8_STATUS_INVALID_PARAM;
+    }
+    stride = w * kModeBpp[mode];
+    size = (uint64_t)stride * h;
    if (!WebPIsRGBMode(mode)) {
      uv_stride = (w + 1) / 2;
      uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
@ -169,11 +174,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
  return VP8_STATUS_OK;
 }

-VP8StatusCode WebPAllocateDecBuffer(int w, int h,
+VP8StatusCode WebPAllocateDecBuffer(int width, int height,
                                    const WebPDecoderOptions* const options,
-                                    WebPDecBuffer* const out) {
+                                    WebPDecBuffer* const buffer) {
  VP8StatusCode status;
-  if (out == NULL || w <= 0 || h <= 0) {
+  if (buffer == NULL || width <= 0 || height <= 0) {
    return VP8_STATUS_INVALID_PARAM;
  }
  if (options != NULL) {    // First, apply options if there is any.
@ -182,33 +187,39 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
      const int ch = options->crop_height;
      const int x = options->crop_left & ~1;
      const int y = options->crop_top & ~1;
-      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
+      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
+          x + cw > width || y + ch > height) {
        return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
      }
-      w = cw;
-      h = ch;
+      width = cw;
+      height = ch;
    }
+
    if (options->use_scaling) {
+#if !defined(WEBP_REDUCE_SIZE)
      int scaled_width = options->scaled_width;
      int scaled_height = options->scaled_height;
      if (!WebPRescalerGetScaledDimensions(
-              w, h, &scaled_width, &scaled_height)) {
+              width, height, &scaled_width, &scaled_height)) {
        return VP8_STATUS_INVALID_PARAM;
      }
-      w = scaled_width;
-      h = scaled_height;
+      width = scaled_width;
+      height = scaled_height;
+#else
+      return VP8_STATUS_INVALID_PARAM;   // rescaling not supported
+#endif
    }
  }
-  out->width = w;
-  out->height = h;
+  buffer->width = width;
+  buffer->height = height;

  // Then, allocate buffer for real.
-  status = AllocateBuffer(out);
+  status = AllocateBuffer(buffer);
  if (status != VP8_STATUS_OK) return status;

  // Use the stride trick if vertical flip is needed.
  if (options != NULL && options->flip) {
-    status = WebPFlipBuffer(out);
+    status = WebPFlipBuffer(buffer);
  }
  return status;
 }
--- a/src/dec/common_dec.h
+++ b/src/dec/common_dec.h
@ -11,8 +11,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#ifndef WEBP_DEC_COMMON_H_
-#define WEBP_DEC_COMMON_H_
+#ifndef WEBP_DEC_COMMON_DEC_H_
+#define WEBP_DEC_COMMON_DEC_H_

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@ -51,4 +51,4 @@ enum { MB_FEATURE_TREE_PROBS = 3,
       NUM_PROBAS = 11
     };

-#endif    // WEBP_DEC_COMMON_H_
+#endif    // WEBP_DEC_COMMON_DEC_H_
--- a/src/dec/frame_dec.c
+++ b/src/dec/frame_dec.c
@ -12,13 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "./vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"

 //------------------------------------------------------------------------------
 // Main reconstruction function.

-static const int kScan[16] = {
+static const uint16_t kScan[16] = {
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
@ -320,7 +320,7 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
 #define MIN_DITHER_AMP 4

 #define DITHER_AMP_TAB_SIZE 12
-static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
+static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
  // roughly, it's dqm->uv_mat_[1]
  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
 };
@ -728,7 +728,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
  }

  mem = (uint8_t*)dec->mem_;
-  dec->intra_t_ = (uint8_t*)mem;
+  dec->intra_t_ = mem;
  mem += intra_pred_mode_size;

  dec->yuv_t_ = (VP8TopSamples*)mem;
@ -750,7 +750,7 @@ static int AllocateMemory(VP8Decoder* const dec) {

  mem = (uint8_t*)WEBP_ALIGN(mem);
  assert((yuv_size & WEBP_ALIGN_CST) == 0);
-  dec->yuv_b_ = (uint8_t*)mem;
+  dec->yuv_b_ = mem;
  mem += yuv_size;

  dec->mb_data_ = (VP8MBData*)mem;
@ -766,7 +766,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
    const int extra_rows = kFilterExtraRows[dec->filter_type_];
    const int extra_y = extra_rows * dec->cache_y_stride_;
    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
-    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
+    dec->cache_y_ = mem + extra_y;
    dec->cache_u_ = dec->cache_y_
                  + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
    dec->cache_v_ = dec->cache_u_
@ -776,7 +776,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
  mem += cache_size;

  // alpha plane
-  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
+  dec->alpha_plane_ = alpha_size ? mem : NULL;
  mem += alpha_size;
  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);

--- a/src/dec/idec_dec.c
+++ b/src/dec/idec_dec.c
@ -15,10 +15,10 @@
 #include <string.h>
 #include <stdlib.h>

-#include "./alphai_dec.h"
-#include "./webpi_dec.h"
-#include "./vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"

 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
@ -673,12 +673,12 @@ void WebPIDelete(WebPIDecoder* idec) {
 //------------------------------------------------------------------------------
 // Wrapper toward WebPINewDecoder

-WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
+WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer,
                          size_t output_buffer_size, int output_stride) {
  const int is_external_memory = (output_buffer != NULL) ? 1 : 0;
  WebPIDecoder* idec;

-  if (mode >= MODE_YUV) return NULL;
+  if (csp >= MODE_YUV) return NULL;
  if (is_external_memory == 0) {    // Overwrite parameters to sane values.
    output_buffer_size = 0;
    output_stride = 0;
@ -689,7 +689,7 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
  }
  idec = WebPINewDecoder(NULL);
  if (idec == NULL) return NULL;
-  idec->output_.colorspace = mode;
+  idec->output_.colorspace = csp;
  idec->output_.is_external_memory = is_external_memory;
  idec->output_.u.RGBA.rgba = output_buffer;
  idec->output_.u.RGBA.stride = output_stride;
--- a/src/dec/io_dec.c
+++ b/src/dec/io_dec.c
@ -13,11 +13,11 @@

 #include <assert.h>
 #include <stdlib.h>
-#include "../dec/vp8i_dec.h"
-#include "./webpi_dec.h"
-#include "../dsp/dsp.h"
-#include "../dsp/yuv.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/yuv.h"
+#include "src/utils/utils.h"

 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions
@ -212,7 +212,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
    int num_rows;
    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
    uint8_t* alpha_dst = base_rgba;
 #else
    uint8_t* alpha_dst = base_rgba + 1;
@ -241,6 +241,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
 //------------------------------------------------------------------------------
 // YUV rescaling (no final RGB conversion needed)

+#if !defined(WEBP_REDUCE_SIZE)
 static int Rescale(const uint8_t* src, int src_stride,
                   int new_lines, WebPRescaler* const wrk) {
  int num_lines_out = 0;
@ -431,7 +432,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
                               int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
  uint8_t* alpha_dst = base_rgba;
 #else
  uint8_t* alpha_dst = base_rgba + 1;
@ -541,6 +542,8 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
  return 1;
 }

+#endif  // WEBP_REDUCE_SIZE
+
 //------------------------------------------------------------------------------
 // Default custom functions

@ -561,10 +564,14 @@ static int CustomSetup(VP8Io* io) {
    WebPInitUpsamplers();
  }
  if (io->use_scaling) {
+#if !defined(WEBP_REDUCE_SIZE)
    const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
    if (!ok) {
      return 0;    // memory error
    }
+#else
+    return 0;   // rescaling support not compiled
+#endif
  } else {
    if (is_rgb) {
      WebPInitSamplers();
@ -598,9 +605,6 @@ static int CustomSetup(VP8Io* io) {
    }
  }

-  if (is_rgb) {
-    VP8YUVInit();
-  }
  return 1;
 }

--- a/src/dec/quant_dec.c
+++ b/src/dec/quant_dec.c
@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./vp8i_dec.h"
+#include "src/dec/vp8i_dec.h"

 static WEBP_INLINE int clip(int v, int M) {
  return v < 0 ? 0 : v > M ? M : v;
--- a/src/dec/tree_dec.c
+++ b/src/dec/tree_dec.c
@ -11,15 +11,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./vp8i_dec.h"
-#include "../utils/bit_reader_inl_utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/bit_reader_inl_utils.h"

+#if !defined(USE_GENERIC_TREE)
 #if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
-#define USE_GENERIC_TREE
+#define USE_GENERIC_TREE 1   // ALTERNATE_CODE
+#else
+#define USE_GENERIC_TREE 0
 #endif
+#endif  // USE_GENERIC_TREE

-#ifdef USE_GENERIC_TREE
+#if (USE_GENERIC_TREE == 1)
 static const int8_t kYModesIntra4[18] = {
  -B_DC_PRED, 1,
    -B_TM_PRED, 2,
@ -317,7 +321,7 @@ static void ParseIntraMode(VP8BitReader* const br,
      int x;
      for (x = 0; x < 4; ++x) {
        const uint8_t* const prob = kBModesProba[top[x]][ymode];
-#ifdef USE_GENERIC_TREE
+#if (USE_GENERIC_TREE == 1)
        // Generic tree-parsing
        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
        while (i > 0) {
@ -335,7 +339,7 @@ static void ParseIntraMode(VP8BitReader* const br,
                        (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
                          (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
-#endif    // USE_GENERIC_TREE
+#endif  // USE_GENERIC_TREE
        top[x] = ymode;
      }
      memcpy(modes, top, 4 * sizeof(*top));
@ -498,7 +502,7 @@ static const uint8_t

 // Paragraph 9.9

-static const int kBands[16 + 1] = {
+static const uint8_t kBands[16 + 1] = {
  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
  0  // extra entry as sentinel
 };
--- a/src/dec/vp8_dec.c
+++ b/src/dec/vp8_dec.c
@ -13,12 +13,12 @@

 #include <stdlib.h>

-#include "./alphai_dec.h"
-#include "./vp8i_dec.h"
-#include "./vp8li_dec.h"
-#include "./webpi_dec.h"
-#include "../utils/bit_reader_inl_utils.h"
-#include "../utils/utils.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/bit_reader_inl_utils.h"
+#include "src/utils/utils.h"

 //------------------------------------------------------------------------------

--- a/src/dec/vp8_dec.h
+++ b/src/dec/vp8_dec.h
@ -11,10 +11,10 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#ifndef WEBP_WEBP_DECODE_VP8_H_
-#define WEBP_WEBP_DECODE_VP8_H_
+#ifndef WEBP_DEC_VP8_DEC_H_
+#define WEBP_DEC_VP8_DEC_H_

-#include "../webp/decode.h"
+#include "src/webp/decode.h"

 #ifdef __cplusplus
 extern "C" {
@ -157,24 +157,24 @@ void VP8Delete(VP8Decoder* const dec);
 // Miscellaneous VP8/VP8L bitstream probing functions.

 // Returns true if the next 3 bytes in data contain the VP8 signature.
-WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
+WEBP_EXTERN int VP8CheckSignature(const uint8_t* const data, size_t data_size);

 // Validates the VP8 data-header and retrieves basic header information viz
 // width and height. Returns 0 in case of formatting error. *width/*height
 // can be passed NULL.
-WEBP_EXTERN(int) VP8GetInfo(
+WEBP_EXTERN int VP8GetInfo(
    const uint8_t* data,
    size_t data_size,    // data available so far
    size_t chunk_size,   // total data size expected in the chunk
    int* const width, int* const height);

 // Returns true if the next byte(s) in data is a VP8L signature.
-WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
+WEBP_EXTERN int VP8LCheckSignature(const uint8_t* const data, size_t size);

 // Validates the VP8L data-header and retrieves basic header information viz
 // width, height and alpha. Returns 0 in case of formatting error.
 // width/height/has_alpha can be passed NULL.
-WEBP_EXTERN(int) VP8LGetInfo(
+WEBP_EXTERN int VP8LGetInfo(
    const uint8_t* data, size_t data_size,  // data available so far
    int* const width, int* const height, int* const has_alpha);

@ -182,4 +182,4 @@ WEBP_EXTERN(int) VP8LGetInfo(
 }    // extern "C"
 #endif

-#endif  /* WEBP_WEBP_DECODE_VP8_H_ */
+#endif  /* WEBP_DEC_VP8_DEC_H_ */
--- a/src/dec/vp8i_dec.h
+++ b/src/dec/vp8i_dec.h
@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#ifndef WEBP_DEC_VP8I_H_
-#define WEBP_DEC_VP8I_H_
+#ifndef WEBP_DEC_VP8I_DEC_H_
+#define WEBP_DEC_VP8I_DEC_H_

 #include <string.h>     // for memcpy()
-#include "./common_dec.h"
-#include "./vp8li_dec.h"
-#include "../utils/bit_reader_utils.h"
-#include "../utils/random_utils.h"
-#include "../utils/thread_utils.h"
-#include "../dsp/dsp.h"
+#include "src/dec/common_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/bit_reader_utils.h"
+#include "src/utils/random_utils.h"
+#include "src/utils/thread_utils.h"
+#include "src/dsp/dsp.h"

 #ifdef __cplusplus
 extern "C" {
@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 0
 #define DEC_MIN_VERSION 6
-#define DEC_REV_VERSION 0
+#define DEC_REV_VERSION 1

 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
@ -57,7 +57,6 @@ extern "C" {
 //  '|' = left sample,   '-' = top sample,    '+' = top-left sample
 //  't' = extra top-right sample for 4x4 modes
 #define YUV_SIZE (BPS * 17 + BPS * 9)
-#define Y_SIZE   (BPS * 17)
 #define Y_OFF    (BPS * 1 + 8)
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)
@ -317,4 +316,4 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
 }    // extern "C"
 #endif

-#endif  /* WEBP_DEC_VP8I_H_ */
+#endif  /* WEBP_DEC_VP8I_DEC_H_ */
--- a/src/dec/vp8l_dec.c
+++ b/src/dec/vp8l_dec.c
@ -14,22 +14,22 @@

 #include <stdlib.h>

-#include "./alphai_dec.h"
-#include "./vp8li_dec.h"
-#include "../dsp/dsp.h"
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "../dsp/yuv.h"
-#include "../utils/endian_inl_utils.h"
-#include "../utils/huffman_utils.h"
-#include "../utils/utils.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/dsp/yuv.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/utils/huffman_utils.h"
+#include "src/utils/utils.h"

 #define NUM_ARGB_CACHE_ROWS          16

 static const int kCodeLengthLiterals = 16;
 static const int kCodeLengthRepeatCode = 16;
-static const int kCodeLengthExtraBits[3] = { 2, 3, 7 };
-static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
+static const uint8_t kCodeLengthExtraBits[3] = { 2, 3, 7 };
+static const uint8_t kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };

 // -----------------------------------------------------------------------------
 //  Five Huffman codes are used at each meta code:
@ -86,7 +86,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
 // All values computed for 8-bit first level lookup with Mark Adler's tool:
 // http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
 #define FIXED_TABLE_SIZE (630 * 3 + 410)
-static const int kTableSize[12] = {
+static const uint16_t kTableSize[12] = {
  FIXED_TABLE_SIZE + 654,
  FIXED_TABLE_SIZE + 656,
  FIXED_TABLE_SIZE + 658,
@ -485,6 +485,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
 //------------------------------------------------------------------------------
 // Scaling.

+#if !defined(WEBP_REDUCE_SIZE)
 static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
  const int num_channels = 4;
  const int in_width = io->mb_w;
@ -516,10 +517,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
                   out_width, out_height, 0, num_channels, work);
  return 1;
 }
+#endif   // WEBP_REDUCE_SIZE

 //------------------------------------------------------------------------------
 // Export to ARGB

+#if !defined(WEBP_REDUCE_SIZE)
+
 // We have special "export" function since we need to convert from BGRA
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                  int rgba_stride, uint8_t* const rgba) {
@ -561,6 +565,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
  return num_lines_out;
 }

+#endif   // WEBP_REDUCE_SIZE
+
 // Emit rows without any scaling.
 static int EmitRows(WEBP_CSP_MODE colorspace,
                    const uint8_t* row_in, int in_stride,
@ -746,9 +752,12 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
      if (WebPIsRGBMode(output->colorspace)) {  // convert to RGBA
        const WebPRGBABuffer* const buf = &output->u.RGBA;
        uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
-        const int num_rows_out = io->use_scaling ?
+        const int num_rows_out =
+#if !defined(WEBP_REDUCE_SIZE)
+         io->use_scaling ?
            EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
                                 rgba, buf->stride) :
+#endif  // WEBP_REDUCE_SIZE
            EmitRows(output->colorspace, rows_data, in_stride,
                     io->mb_w, io->mb_h, rgba, buf->stride);
        // Update 'last_out_row_'.
@ -1632,12 +1641,19 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {

    if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;

+#if !defined(WEBP_REDUCE_SIZE)
     if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
+#else
+    if (io->use_scaling) {
+      dec->status_ = VP8_STATUS_INVALID_PARAM;  // rescaling not compiled in
+      goto Err;
+    }
+#endif  // WEBP_REDUCE_SIZE

     if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
       // need the alpha-multiply functions for premultiplied output or rescaling
       WebPInitAlphaProcessing();
     }
    if (!WebPIsRGBMode(dec->output_->colorspace)) {
      WebPInitConvertARGBToYUV();
      if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();
--- a/src/dec/vp8li_dec.h
+++ b/src/dec/vp8li_dec.h
@ -12,14 +12,14 @@
 // Author: Skal (pascal.massimino@gmail.com)
 //         Vikas Arora(vikaas.arora@gmail.com)

-#ifndef WEBP_DEC_VP8LI_H_
-#define WEBP_DEC_VP8LI_H_
+#ifndef WEBP_DEC_VP8LI_DEC_H_
+#define WEBP_DEC_VP8LI_DEC_H_

 #include <string.h>     // for memcpy()
-#include "./webpi_dec.h"
-#include "../utils/bit_reader_utils.h"
-#include "../utils/color_cache_utils.h"
-#include "../utils/huffman_utils.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/bit_reader_utils.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/huffman_utils.h"

 #ifdef __cplusplus
 extern "C" {
@ -132,4 +132,4 @@ void VP8LDelete(VP8LDecoder* const dec);
 }    // extern "C"
 #endif

-#endif  /* WEBP_DEC_VP8LI_H_ */
+#endif  /* WEBP_DEC_VP8LI_DEC_H_ */
--- a/src/dec/webp_dec.c
+++ b/src/dec/webp_dec.c
@ -13,11 +13,11 @@

 #include <stdlib.h>

-#include "./vp8i_dec.h"
-#include "./vp8li_dec.h"
-#include "./webpi_dec.h"
-#include "../utils/utils.h"
-#include "../webp/mux_types.h"  // ALPHA_FLAG
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/utils.h"
+#include "src/webp/mux_types.h"  // ALPHA_FLAG

 //------------------------------------------------------------------------------
 // RIFF layout is:
@ -421,7 +421,9 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
                                NULL, NULL, NULL, &has_animation,
                                NULL, headers);
  if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
-    // TODO(jzern): full support of animation frames will require API additions.
+    // The WebPDemux API + libwebp can be used to decode individual
+    // uncomposited frames or the WebPAnimDecoder can be used to fully
+    // reconstruct them (see webp/demux.h).
    if (has_animation) {
      status = VP8_STATUS_UNSUPPORTED_FEATURE;
    }
--- a/src/dec/webpi_dec.h
+++ b/src/dec/webpi_dec.h
@ -11,15 +11,15 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)

-#ifndef WEBP_DEC_WEBPI_H_
-#define WEBP_DEC_WEBPI_H_
+#ifndef WEBP_DEC_WEBPI_DEC_H_
+#define WEBP_DEC_WEBPI_DEC_H_

 #ifdef __cplusplus
 extern "C" {
 #endif

-#include "../utils/rescaler_utils.h"
-#include "./vp8_dec.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/dec/vp8_dec.h"

 //------------------------------------------------------------------------------
 // WebPDecParams: Decoding output parameters. Transient internal object.
@ -130,4 +130,4 @@ int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
 }    // extern "C"
 #endif

-#endif  /* WEBP_DEC_WEBPI_H_ */
+#endif  /* WEBP_DEC_WEBPI_DEC_H_ */
--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -1,3 +1,4 @@
+AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
 lib_LTLIBRARIES = libwebpdemux.la

 libwebpdemux_la_SOURCES =
@ -9,6 +10,6 @@ libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
 libwebpdemuxinclude_HEADERS += ../webp/types.h

 libwebpdemux_la_LIBADD = ../libwebp.la
-libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:2:0
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:3:0
 libwebpdemuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/anim_decode.c
+++ b/src/demux/anim_decode.c
@ -11,15 +11,15 @@
 //

 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif

 #include <assert.h>
 #include <string.h>

-#include "../utils/utils.h"
-#include "../webp/decode.h"
-#include "../webp/demux.h"
+#include "src/utils/utils.h"
+#include "src/webp/decode.h"
+#include "src/webp/demux.h"

 #define NUM_CHANNELS 4

--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@ -11,21 +11,21 @@
 //

 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif

 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>

-#include "../utils/utils.h"
-#include "../webp/decode.h"     // WebPGetFeatures
-#include "../webp/demux.h"
-#include "../webp/format_constants.h"
+#include "src/utils/utils.h"
+#include "src/webp/decode.h"     // WebPGetFeatures
+#include "src/webp/demux.h"
+#include "src/webp/format_constants.h"

 #define DMUX_MAJ_VERSION 0
 #define DMUX_MIN_VERSION 3
-#define DMUX_REV_VERSION 2
+#define DMUX_REV_VERSION 3

 typedef struct {
  size_t start_;        // start location of the data
@ -205,12 +205,14 @@ static void SetFrameInfo(size_t start_offset, size_t size,
  frame->complete_ = complete;
 }

-// Store image bearing chunks to 'frame'.
+// Store image-bearing chunks to 'frame'. 'min_size' is an optional size
+// requirement; it may be zero.
 static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
                              MemBuffer* const mem, Frame* const frame) {
  int alpha_chunks = 0;
  int image_chunks = 0;
-  int done = (MemDataSize(mem) < min_size);
+  int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE ||
+              MemDataSize(mem) < min_size);
  ParseStatus status = PARSE_OK;

  if (done) return PARSE_NEED_MORE_DATA;
@ -401,9 +403,9 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
  frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
  if (frame == NULL) return PARSE_ERROR;

-  // For the single image case we allow parsing of a partial frame, but we need
-  // at least CHUNK_HEADER_SIZE for parsing.
-  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
+  // For the single image case we allow parsing of a partial frame, so no
+  // minimum size is imposed here.
+  status = StoreFrame(1, 0, &dmux->mem_, frame);
  if (status != PARSE_ERROR) {
    const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
    // Clear any alpha when the alpha flag is missing.
--- a/src/demux/libwebpdemux.rc
+++ b/src/demux/libwebpdemux.rc
@ -6,8 +6,8 @@
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 0,3,0,2
- PRODUCTVERSION 0,3,0,2
+ FILEVERSION 0,3,0,3
+ PRODUCTVERSION 0,3,0,3
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@ -24,12 +24,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "Google, Inc."
            VALUE "FileDescription", "libwebpdemux DLL"
-            VALUE "FileVersion", "0.3.2"
+            VALUE "FileVersion", "0.3.3"
            VALUE "InternalName", "libwebpdemux.dll"
            VALUE "LegalCopyright", "Copyright (C) 2017"
            VALUE "OriginalFilename", "libwebpdemux.dll"
            VALUE "ProductName", "WebP Image Demuxer"
-            VALUE "ProductVersion", "0.3.2"
+            VALUE "ProductVersion", "0.3.3"
        END
    END
    BLOCK "VarFileInfo"
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -1,9 +1,15 @@
-noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
-noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
-noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la
-noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la
-noinst_LTLIBRARIES += libwebpdsp_msa.la libwebpdspdecode_msa.la
-noinst_LTLIBRARIES += libwebpdspdecode_wasm.la
+AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
+noinst_LTLIBRARIES =
+noinst_LTLIBRARIES += libwebpdsp.la
+noinst_LTLIBRARIES += libwebpdsp_avx2.la
+noinst_LTLIBRARIES += libwebpdsp_sse2.la
+noinst_LTLIBRARIES += libwebpdspdecode_sse2.la
+noinst_LTLIBRARIES += libwebpdsp_sse41.la
+noinst_LTLIBRARIES += libwebpdspdecode_sse41.la
+noinst_LTLIBRARIES += libwebpdsp_neon.la
+noinst_LTLIBRARIES += libwebpdspdecode_neon.la
+noinst_LTLIBRARIES += libwebpdsp_msa.la
+noinst_LTLIBRARIES += libwebpdspdecode_msa.la

 if BUILD_LIBWEBPDECODER
  noinst_LTLIBRARIES += libwebpdspdecode.la
@ -40,8 +46,6 @@ COMMON_SOURCES += yuv_mips32.c
 COMMON_SOURCES += yuv_mips_dsp_r2.c

 ENC_SOURCES =
-ENC_SOURCES += argb.c
-ENC_SOURCES += argb_mips_dsp_r2.c
 ENC_SOURCES += cost.c
 ENC_SOURCES += cost_mips32.c
 ENC_SOURCES += cost_mips_dsp_r2.c
@ -97,12 +101,7 @@ libwebpdspdecode_msa_la_SOURCES += upsampling_msa.c
 libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_msa_la_CPPFLAGS)
 libwebpdspdecode_msa_la_CFLAGS = $(libwebpdsp_msa_la_CFLAGS)

-# WASM is not fully integrated into configure; the addition here keeps source
-# extraction by cmake simple.
-libwebpdspdecode_wasm_la_SOURCES = dec_wasm.c
-
 libwebpdsp_sse2_la_SOURCES =
-libwebpdsp_sse2_la_SOURCES += argb_sse2.c
 libwebpdsp_sse2_la_SOURCES += cost_sse2.c
 libwebpdsp_sse2_la_SOURCES += enc_sse2.c
 libwebpdsp_sse2_la_SOURCES += lossless_enc_sse2.c
@ -143,7 +142,8 @@ libwebpdsp_la_CPPFLAGS += $(AM_CPPFLAGS)
 libwebpdsp_la_CPPFLAGS += $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
 libwebpdsp_la_LDFLAGS = -lm
 libwebpdsp_la_LIBADD =
-libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la
+libwebpdsp_la_LIBADD += libwebpdsp_avx2.la
+libwebpdsp_la_LIBADD += libwebpdsp_sse2.la
 libwebpdsp_la_LIBADD += libwebpdsp_sse41.la
 libwebpdsp_la_LIBADD += libwebpdsp_neon.la
 libwebpdsp_la_LIBADD += libwebpdsp_msa.la
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@ -12,10 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <assert.h>
-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 // Tables can be faster on some platform but incur some extra binary size (~2k).
-// #define USE_TABLES_FOR_ALPHA_MULT
+#if !defined(USE_TABLES_FOR_ALPHA_MULT)
+#define USE_TABLES_FOR_ALPHA_MULT 0   // ALTERNATE_CODE
+#endif
+

 // -----------------------------------------------------------------------------

@ -29,7 +32,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) {
  return v;
 }

-#ifdef USE_TABLES_FOR_ALPHA_MULT
+#if (USE_TABLES_FOR_ALPHA_MULT == 1)

 static const uint32_t kMultTables[2][256] = {
  {    // (255u << MFIX) / alpha
@ -132,9 +135,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
  return inverse ? (255u << MFIX) / a : a * KINV_255;
 }

-#endif    // USE_TABLES_FOR_ALPHA_MULT
+#endif  // USE_TABLES_FOR_ALPHA_MULT

-void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
+void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t argb = ptr[x];
@ -154,8 +157,8 @@ void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
  }
 }

-void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
-                  int width, int inverse) {
+void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+                   int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t a = alpha[x];
@ -217,8 +220,9 @@ void WebPMultRows(uint8_t* ptr, int stride,
 #define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
 #endif

-static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
-                               int w, int h, int stride) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
+                                 int w, int h, int stride) {
  while (h-- > 0) {
    uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
    const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
@ -235,6 +239,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
    rgba += stride;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 #undef MULTIPLIER
 #undef PREMULTIPLY

@ -254,9 +259,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
  return (x * m) >> 16;
 }

-static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
-                                               int w, int h, int stride,
-                                               int rg_byte_pos /* 0 or 1 */) {
+static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
+                                                 int w, int h, int stride,
+                                                 int rg_byte_pos /* 0 or 1 */) {
  while (h-- > 0) {
    int i;
    for (i = 0; i < w; ++i) {
@ -275,15 +280,16 @@ static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
 }
 #undef MULTIPLIER

-static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
-                                   int w, int h, int stride) {
-#ifdef WEBP_SWAP_16BIT_CSP
-  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
+static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
+                                     int w, int h, int stride) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
 #else
-  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
+  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
 #endif
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
                           int width, int height,
                           uint8_t* dst, int dst_stride) {
@ -338,6 +344,36 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
  int i;
  for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+
+static int HasAlpha8b_C(const uint8_t* src, int length) {
+  while (length-- > 0) if (*src++ != 0xff) return 1;
+  return 0;
+}
+
+static int HasAlpha32b_C(const uint8_t* src, int length) {
+  int x;
+  for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// Simple channel manipulations.
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                      int len, int step, uint32_t* out) {
+  int i, offset = 0;
+  for (i = 0; i < len; ++i) {
+    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
+    offset += step;
+  }
+}

 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
@ -345,6 +381,11 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out);
+
+int (*WebPHasAlpha8b)(const uint8_t* src, int length);
+int (*WebPHasAlpha32b)(const uint8_t* src, int length);

 //------------------------------------------------------------------------------
 // Init function
@ -360,15 +401,21 @@ static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;

-  WebPMultARGBRow = WebPMultARGBRowC;
-  WebPMultRow = WebPMultRowC;
-  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
-  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+  WebPMultARGBRow = WebPMultARGBRow_C;
+  WebPMultRow = WebPMultRow_C;
+  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;

+  WebPPackRGB = PackRGB_C;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
  WebPDispatchAlpha = DispatchAlpha_C;
  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
  WebPExtractAlpha = ExtractAlpha_C;
  WebPExtractGreen = ExtractGreen_C;
+#endif
+
+  WebPHasAlpha8b = HasAlpha8b_C;
+  WebPHasAlpha32b = HasAlpha32b_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@ -382,16 +429,31 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
 #endif
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitAlphaProcessingNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      WebPInitAlphaProcessingMIPSdspR2();
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitAlphaProcessingNEON();
+  }
+#endif
+
+  assert(WebPMultARGBRow != NULL);
+  assert(WebPMultRow != NULL);
+  assert(WebPApplyAlphaMultiply != NULL);
+  assert(WebPApplyAlphaMultiply4444 != NULL);
+  assert(WebPDispatchAlpha != NULL);
+  assert(WebPDispatchAlphaToGreen != NULL);
+  assert(WebPExtractAlpha != NULL);
+  assert(WebPExtractGreen != NULL);
+  assert(WebPPackRGB != NULL);
+  assert(WebPHasAlpha8b != NULL);
+  assert(WebPHasAlpha32b != NULL);
+
  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/alpha_processing_mips_dsp_r2.c
+++ b/src/dsp/alpha_processing_mips_dsp_r2.c
@ -12,13 +12,13 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut  (djordje.pesut@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
-                         int width, int height,
-                         uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
+                                   int width, int height,
+                                   uint8_t* dst, int dst_stride) {
  uint32_t alpha_mask = 0xffffffff;
  int i, j, temp0;

@ -79,7 +79,8 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
  return (alpha_mask != 0xff);
 }

-static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
+                                  int inverse) {
  int x;
  const uint32_t c_00ffffff = 0x00ffffffu;
  const uint32_t c_ff000000 = 0xff000000u;
@ -124,14 +125,54 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  }
 }

+static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
+                              const uint8_t* b, int len, int step,
+                              uint32_t* out) {  // writes 'len' packed 32-bit words to 'out'
+  int temp0, temp1, temp2, offset;
+  const int rest = len & 1;  // 1 when 'len' is odd: that last pixel is handled after the loop
+  const int a = 0xff;  // constant byte placed above r/g/b in every output word
+  const uint32_t* const loop_end = out + len - rest;  // value of 'out' at which the loop stops
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"  // offset = 0
+    "beq          %[loop_end], %[out],    0f           \n\t"  // nothing for the loop to do
+  "2:                                                  \n\t"  // loop: one output word per pass
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"  // temp0 = r[offset]
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"  // temp1 = g[offset]
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"  // temp2 = b[offset]
+    "ins          %[temp0],    %[a],      16,     16   \n\t"  // upper half of temp0 = a
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"  // upper half of temp2 = g
+    "addiu        %[out],      %[out],    4            \n\t"  // advance 'out' by one word
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"  // presumably packs bytes as a,r,g,b
+    "sw           %[temp0],    -4(%[out])              \n\t"  // store into the slot just passed
+    "addu         %[offset],   %[offset], %[step]      \n\t"  // offset += step
+    "bne          %[loop_end], %[out],    2b           \n\t"  // repeat until out == loop_end
+  "0:                                                  \n\t"  // tail
+    "beq          %[rest],     $zero,     1f           \n\t"  // done when 'len' was even
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"  // same sequence for the last pixel
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"  // 'out' is not advanced here
+  "1:                                                  \n\t"  // exit
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [offset]"=&r"(offset), [out]"+&r"(out)  // 'out' is read and modified by the asm
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"  // the asm stores through 'out'
+  );
+}
+
 //------------------------------------------------------------------------------
 // Entry point

 extern void WebPInitAlphaProcessingMIPSdspR2(void);

 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
-  WebPDispatchAlpha = DispatchAlpha;
-  WebPMultARGBRow = MultARGBRow;
+  WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
+  WebPMultARGBRow = MultARGBRow_MIPSdspR2;
+  WebPPackRGB = PackRGB_MIPSdspR2;
 }

 #else  // !WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/alpha_processing_neon.c
+++ b/src/dsp/alpha_processing_neon.c
@ -11,11 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_NEON)

-#include "./neon.h"
+#include "src/dsp/neon.h"

 //------------------------------------------------------------------------------

--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>

 //------------------------------------------------------------------------------

-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
-                         int width, int height,
-                         uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
+                              int width, int height,
+                              uint8_t* dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
@ -72,9 +72,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
  return (alpha_and != 0xff);
 }

-static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
-                                 int width, int height,
-                                 uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
+                                      int width, int height,
+                                      uint32_t* dst, int dst_stride) {
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const int limit = width & ~15;
@ -98,9 +98,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
  }
 }

-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
-                        int width, int height,
-                        uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+                             int width, int height,
+                             uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
@ -210,6 +210,61 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
 #undef MULTIPLIER
 #undef PREMULTIPLY

+//------------------------------------------------------------------------------
+// Alpha detection
+
+// Returns 1 if any of the 'length' bytes starting at 'src' differs from 0xff,
+// and 0 otherwise (including when 'length' is zero or negative).
+static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
+  // _mm_set1_epi8() takes a 'char': cast explicitly so that 0xff is not
+  // implicitly narrowed (the value is out of range when 'char' is signed).
+  const __m128i all_0xff = _mm_set1_epi8((char)0xff);
+  int i = 0;
+  // Main loop: compare 16 bytes at a time against 0xff.
+  for (; i + 16 <= length; i += 16) {
+    const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
+    // One mask bit per byte lane: 0xffff means all 16 bytes equal 0xff.
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;
+  }
+  // Scalar tail for the remaining (fewer than 16) bytes.
+  for (; i < length; ++i) if (src[i] != 0xff) return 1;
+  return 0;
+}
+
+// Examines 'length' bytes spaced 4 bytes apart, the first one being src[0]
+// (i.e. src[0], src[4], ..., src[4 * length - 4]). Returns 1 if any of them
+// differs from 0xff, and 0 otherwise.
+static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
+  // Keeps only the byte at offset 0 of every 4-byte group.
+  const __m128i alpha_mask = _mm_set1_epi32(0xff);
+  // _mm_set1_epi8() takes a 'char': cast explicitly so that 0xff is not
+  // implicitly narrowed (the value is out of range when 'char' is signed).
+  const __m128i all_0xff = _mm_set1_epi8((char)0xff);
+  int i = 0;
+  // We don't know if we can access the last 3 bytes after the last alpha
+  // value 'src[4 * length - 4]' (because we don't know if alpha is the first
+  // or the last byte of the quadruplet). Hence the '-3' protection below.
+  length = length * 4 - 3;   // size in bytes
+  // 64 bytes (16 groups) per iteration.
+  for (; i + 64 <= length; i += 64) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
+    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
+    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+    const __m128i b2 = _mm_and_si128(a2, alpha_mask);
+    const __m128i b3 = _mm_and_si128(a3, alpha_mask);
+    // Narrow the 16 masked 32-bit values (each in [0, 255]) down to 16 bytes.
+    const __m128i c0 = _mm_packs_epi32(b0, b1);
+    const __m128i c1 = _mm_packs_epi32(b2, b3);
+    const __m128i d  = _mm_packus_epi16(c0, c1);
+    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;
+  }
+  // 32 bytes (8 groups) per iteration.
+  for (; i + 32 <= length; i += 32) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
+    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+    const __m128i c  = _mm_packs_epi32(b0, b1);
+    // Only 8 values here: 'c' is packed with itself, so both halves of 'd'
+    // hold the same 8 bytes and the full 16-bit mask can still be tested.
+    const __m128i d  = _mm_packus_epi16(c, c);
+    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;
+  }
+  // Scalar tail: one byte every 4, up to and including src[4 * length - 4].
+  for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
+  return 0;
+}
+
 // -----------------------------------------------------------------------------
 // Apply alpha value to rows

@ -238,7 +293,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
    }
  }
  width -= x;
-  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
+  if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
 }

 static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
@ -261,7 +316,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
    }
  }
  width -= x;
-  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
+  if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
 }

 //------------------------------------------------------------------------------
@ -273,9 +328,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
  WebPMultARGBRow = MultARGBRow_SSE2;
  WebPMultRow = MultRow_SSE2;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
-  WebPDispatchAlpha = DispatchAlpha;
-  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
-  WebPExtractAlpha = ExtractAlpha;
+  WebPDispatchAlpha = DispatchAlpha_SSE2;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
+  WebPExtractAlpha = ExtractAlpha_SSE2;
+
+  WebPHasAlpha8b = HasAlpha8b_SSE2;
+  WebPHasAlpha32b = HasAlpha32b_SSE2;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/alpha_processing_sse41.c
+++ b/src/dsp/alpha_processing_sse41.c
@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_SSE41)

@ -19,9 +19,9 @@

 //------------------------------------------------------------------------------

-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
-                        int width, int height,
-                        uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
+                              int width, int height,
+                              uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
@ -82,7 +82,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
 extern void WebPInitAlphaProcessingSSE41(void);

 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
-  WebPExtractAlpha = ExtractAlpha;
+  WebPExtractAlpha = ExtractAlpha_SSE41;
 }

 #else  // !WEBP_USE_SSE41
--- a/src/dsp/argb.c
+++ b/src/dsp/argb.c
@ -1,68 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//   ARGB making functions.
-//
-// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-
-#include "./dsp.h"
-
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
-  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int len, uint32_t* out) {
-  int i;
-  for (i = 0; i < len; ++i) {
-    out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
-  }
-}
-
-static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out) {
-  int i, offset = 0;
-  for (i = 0; i < len; ++i) {
-    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
-    offset += step;
-  }
-}
-
-void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
-                    const uint8_t*, int, uint32_t*);
-void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
-                   int, int, uint32_t*);
-
-extern void VP8EncDspARGBInitMIPSdspR2(void);
-extern void VP8EncDspARGBInitSSE2(void);
-
-static volatile VP8CPUInfo argb_last_cpuinfo_used =
-    (VP8CPUInfo)&argb_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
-  if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
-
-  VP8PackARGB = PackARGB;
-  VP8PackRGB = PackRGB;
-
-  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8EncDspARGBInitSSE2();
-    }
-#endif
-#if defined(WEBP_USE_MIPS_DSP_R2)
-    if (VP8GetCPUInfo(kMIPSdspR2)) {
-      VP8EncDspARGBInitMIPSdspR2();
-    }
-#endif
-  }
-  argb_last_cpuinfo_used = VP8GetCPUInfo;
-}
--- a/src/dsp/argb_mips_dsp_r2.c
+++ b/src/dsp/argb_mips_dsp_r2.c
@ -1,110 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//   ARGB making functions (mips version).
-//
-// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_MIPS_DSP_R2)
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int len, uint32_t* out) {
-  int temp0, temp1, temp2, temp3, offset;
-  const int rest = len & 1;
-  const uint32_t* const loop_end = out + len - rest;
-  const int step = 4;
-  __asm__ volatile (
-    "xor          %[offset],   %[offset], %[offset]    \n\t"
-    "beq          %[loop_end], %[out],    0f           \n\t"
-  "2:                                                  \n\t"
-    "lbux         %[temp0],    %[offset](%[a])         \n\t"
-    "lbux         %[temp1],    %[offset](%[r])         \n\t"
-    "lbux         %[temp2],    %[offset](%[g])         \n\t"
-    "lbux         %[temp3],    %[offset](%[b])         \n\t"
-    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
-    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
-    "addiu        %[out],      %[out],    4            \n\t"
-    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
-    "sw           %[temp0],    -4(%[out])              \n\t"
-    "addu         %[offset],   %[offset], %[step]      \n\t"
-    "bne          %[loop_end], %[out],    2b           \n\t"
-  "0:                                                  \n\t"
-    "beq          %[rest],     $zero,     1f           \n\t"
-    "lbux         %[temp0],    %[offset](%[a])         \n\t"
-    "lbux         %[temp1],    %[offset](%[r])         \n\t"
-    "lbux         %[temp2],    %[offset](%[g])         \n\t"
-    "lbux         %[temp3],    %[offset](%[b])         \n\t"
-    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
-    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
-    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
-    "sw           %[temp0],    0(%[out])               \n\t"
-  "1:                                                  \n\t"
-    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
-    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
-      [loop_end]"r"(loop_end), [rest]"r"(rest)
-    : "memory"
-  );
-}
-
-static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out) {
-  int temp0, temp1, temp2, offset;
-  const int rest = len & 1;
-  const int a = 0xff;
-  const uint32_t* const loop_end = out + len - rest;
-  __asm__ volatile (
-    "xor          %[offset],   %[offset], %[offset]    \n\t"
-    "beq          %[loop_end], %[out],    0f           \n\t"
-  "2:                                                  \n\t"
-    "lbux         %[temp0],    %[offset](%[r])         \n\t"
-    "lbux         %[temp1],    %[offset](%[g])         \n\t"
-    "lbux         %[temp2],    %[offset](%[b])         \n\t"
-    "ins          %[temp0],    %[a],      16,     16   \n\t"
-    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
-    "addiu        %[out],      %[out],    4            \n\t"
-    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
-    "sw           %[temp0],    -4(%[out])              \n\t"
-    "addu         %[offset],   %[offset], %[step]      \n\t"
-    "bne          %[loop_end], %[out],    2b           \n\t"
-  "0:                                                  \n\t"
-    "beq          %[rest],     $zero,     1f           \n\t"
-    "lbux         %[temp0],    %[offset](%[r])         \n\t"
-    "lbux         %[temp1],    %[offset](%[g])         \n\t"
-    "lbux         %[temp2],    %[offset](%[b])         \n\t"
-    "ins          %[temp0],    %[a],      16,     16   \n\t"
-    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
-    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
-    "sw           %[temp0],    0(%[out])               \n\t"
-  "1:                                                  \n\t"
-    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [offset]"=&r"(offset), [out]"+&r"(out)
-    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
-      [loop_end]"r"(loop_end), [rest]"r"(rest)
-    : "memory"
-  );
-}
-
-//------------------------------------------------------------------------------
-// Entry point
-
-extern void VP8EncDspARGBInitMIPSdspR2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
-  VP8PackARGB = PackARGB;
-  VP8PackRGB = PackRGB;
-}
-
-#else  // !WEBP_USE_MIPS_DSP_R2
-
-WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
-
-#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/argb_sse2.c
+++ b/src/dsp/argb_sse2.c
@ -1,53 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//   ARGB making functions (SSE2 version).
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "./dsp.h"
-#include "./lossless.h"
-
-#if defined(WEBP_USE_SSE2)
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <string.h>
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int len, uint32_t* out) {
-  (void)a;
-  if (g == r + 1) {  // RGBA input order. Need to swap R and B.
-    assert(b == r + 2);
-    assert(a == r + 3);
-    VP8LConvertBGRAToRGBA((const uint32_t*)r, len, (uint8_t*)out);
-  } else {
-    assert(g == b + 1);
-    assert(r == b + 2);
-    assert(a == b + 3);
-    memcpy(out, b, len * 4);
-  }
-}
-
-//------------------------------------------------------------------------------
-// Entry point
-
-extern void VP8EncDspARGBInitSSE2(void);
-extern void VP8LDspInitSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
-  VP8LDspInitSSE2();
-  VP8PackARGB = PackARGB;
-}
-
-#else  // !WEBP_USE_SSE2
-
-WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
-
-#endif  // WEBP_USE_SSE2
--- a/src/dsp/cost.c
+++ b/src/dsp/cost.c
@ -9,8 +9,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
-#include "../enc/cost_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/enc/cost_enc.h"

 //------------------------------------------------------------------------------
 // Boolean-cost cost table
@ -319,7 +319,7 @@ const uint8_t VP8EncBands[16 + 1] = {
 //------------------------------------------------------------------------------
 // Mode costs

-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  const int p0 = res->prob[n][ctx0][0];
@ -354,8 +354,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  return cost;
 }

-static void SetResidualCoeffs(const int16_t* const coeffs,
-                              VP8Residual* const res) {
+static void SetResidualCoeffs_C(const int16_t* const coeffs,
+                                VP8Residual* const res) {
  int n;
  res->last = -1;
  assert(res->first == 0 || coeffs[0] == 0);
@ -384,8 +384,8 @@ static volatile VP8CPUInfo cost_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
  if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;

-  VP8GetResidualCost = GetResidualCost;
-  VP8SetResidualCoeffs = SetResidualCoeffs;
+  VP8GetResidualCost = GetResidualCost_C;
+  VP8SetResidualCoeffs = SetResidualCoeffs_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
--- a/src/dsp/cost_mips32.c
+++ b/src/dsp/cost_mips32.c
@ -9,13 +9,13 @@
 //
 // Author: Djordje Pesut (djordje.pesut@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MIPS32)

-#include "../enc/cost_enc.h"
+#include "src/enc/cost_enc.h"

-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
  int temp0, temp1;
  int v_reg, ctx_reg;
  int n = res->first;
@ -96,8 +96,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  return cost;
 }

-static void SetResidualCoeffs(const int16_t* const coeffs,
-                              VP8Residual* const res) {
+static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
+                                     VP8Residual* const res) {
  const int16_t* p_coeffs = (int16_t*)coeffs;
  int temp0, temp1, temp2, n, n1;
  assert(res->first == 0 || coeffs[0] == 0);
@ -143,8 +143,8 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
 extern void VP8EncDspCostInitMIPS32(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
-  VP8GetResidualCost = GetResidualCost;
-  VP8SetResidualCoeffs = SetResidualCoeffs;
+  VP8GetResidualCost = GetResidualCost_MIPS32;
+  VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
 }

 #else  // !WEBP_USE_MIPS32
--- a/src/dsp/cost_mips_dsp_r2.c
+++ b/src/dsp/cost_mips_dsp_r2.c
@ -9,13 +9,13 @@
 //
 // Author: Djordje Pesut (djordje.pesut@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-#include "../enc/cost_enc.h"
+#include "src/enc/cost_enc.h"

-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
  int temp0, temp1;
  int v_reg, ctx_reg;
  int n = res->first;
@ -97,7 +97,7 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
 extern void VP8EncDspCostInitMIPSdspR2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
-  VP8GetResidualCost = GetResidualCost;
+  VP8GetResidualCost = GetResidualCost_MIPSdspR2;
 }

 #else  // !WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/cost_sse2.c
+++ b/src/dsp/cost_sse2.c
@ -11,19 +11,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>

-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/utils.h"

 //------------------------------------------------------------------------------

-static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
-                                  VP8Residual* const res) {
+static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
+                                   VP8Residual* const res) {
  const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Use SSE2 to compare 16 values with a single instruction.
@ -42,7 +42,7 @@ static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
  res->coeffs = coeffs;
 }

-static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
  uint8_t levels[16], ctxs[16];
  uint16_t abs_levels[16];
  int n = res->first;
@ -108,8 +108,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
 extern void VP8EncDspCostInitSSE2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
-  VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
-  VP8GetResidualCost = GetResidualCostSSE2;
+  VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
+  VP8GetResidualCost = GetResidualCost_SSE2;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -11,7 +11,7 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_HAVE_NEON_RTCD)
 #include <stdio.h>
@ -23,13 +23,11 @@
 #endif

 //------------------------------------------------------------------------------
-// x86/x86-64 micro-arch detection.
+// SSE2 detection.
 //

-// skip x86 specific code for WASM builds
-#if defined(WEBP_USE_WASM)
 // apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
-#elif (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "mov %%ebx, %%edi\n"
@ -65,10 +63,8 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
 #define GetCPUInfo __cpuid
 #endif

-// skip xgetbv definition for WASM builds
-#if defined(WEBP_USE_WASM)
 // NaCl has no support for xgetbv or the raw opcode.
-#elif !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
 static WEBP_INLINE uint64_t xgetbv(void) {
  const uint32_t ecx = 0;
  uint32_t eax, edx;
@ -98,19 +94,7 @@ static WEBP_INLINE uint64_t xgetbv(void) {
 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
 #endif

-//------------------------------------------------------------------------------
-// Platform specific VP8CPUInfo functions.
-//
-
-// WASM needs to precede platform specific architecture checks as the defines
-// will still be present when building this target.
-#if defined(WEBP_USE_WASM)
-static int wasmCPUInfo(CPUFeature feature) {
-  if (feature != kWASM) return 0;
-  return 1;
-}
-VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
-#elif defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)

 // helper function for run-time detection of slow SSSE3 platforms
 static int CheckSlowModel(int info) {
@ -159,7 +143,7 @@ static int x86CPUInfo(CPUFeature feature) {
    return !!(cpu_info[2] & (1 << 0));
  }
  if (feature == kSlowSSSE3) {
-    if (is_intel && (cpu_info[2] & (1 << 0))) {   // SSSE3?
+    if (is_intel && (cpu_info[2] & (1 << 9))) {   // SSSE3?
      return CheckSlowModel(cpu_info[0]);
    }
    return 0;
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -11,9 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include <assert.h>
+
+#include "src/dsp/dsp.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"

 //------------------------------------------------------------------------------

@ -25,7 +27,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 // Transforms (Paragraph 14.4)

 #define STORE(x, y, v) \
-  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
+  dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))

 #define STORE2(y, dc, d, c) do {    \
  const int DC = (dc);              \
@ -38,7 +40,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define MUL1(a) ((((a) * 20091) >> 16) + (a))
 #define MUL2(a) (((a) * 35468) >> 16)

-static void TransformOne(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformOne_C(const int16_t* in, uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
@ -78,7 +81,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
 }

 // Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
  const int a = in[0] + 4;
  const int c4 = MUL2(in[4]);
  const int d4 = MUL1(in[4]);
@ -93,19 +96,21 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 #undef MUL2
 #undef STORE2

-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOne(in, dst);
+static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne_C(in, dst);
  if (do_two) {
-    TransformOne(in + 16, dst + 4);
+    TransformOne_C(in + 16, dst + 4);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void TransformUV(const int16_t* in, uint8_t* dst) {
+static void TransformUV_C(const int16_t* in, uint8_t* dst) {
  VP8Transform(in + 0 * 16, dst, 1);
  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }

-static void TransformDC(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformDC_C(const int16_t* in, uint8_t* dst) {
  const int DC = in[0] + 4;
  int i, j;
  for (j = 0; j < 4; ++j) {
@ -114,8 +119,9 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
    }
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void TransformDCUV(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
  if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
  if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
  if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@ -127,7 +133,8 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
 //------------------------------------------------------------------------------
 // Paragraph 14.3

-static void TransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformWHT_C(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {
@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
    out += 64;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 void (*VP8TransformWHT)(const int16_t* in, int16_t* out);

@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);

 #define DST(x, y) dst[(x) + (y) * BPS]

+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8_t* top = dst - BPS;
  const uint8_t* const clip0 = VP8kclip1 - top[-1];
@ -174,21 +183,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
    dst += BPS;
  }
 }
-static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
+static void TM4_C(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16_C(uint8_t* dst)  { TrueMotion(dst, 16); }

 //------------------------------------------------------------------------------
 // 16x16

-static void VE16(uint8_t* dst) {     // vertical
+static void VE16_C(uint8_t* dst) {     // vertical
  int j;
  for (j = 0; j < 16; ++j) {
    memcpy(dst + j * BPS, dst - BPS, 16);
  }
 }

-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_C(uint8_t* dst) {     // horizontal
  int j;
  for (j = 16; j > 0; --j) {
    memset(dst, dst[-1], 16);
@ -203,7 +212,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
  }
 }

-static void DC16(uint8_t* dst) {    // DC
+static void DC16_C(uint8_t* dst) {    // DC
  int DC = 16;
  int j;
  for (j = 0; j < 16; ++j) {
@ -212,7 +221,7 @@ static void DC16(uint8_t* dst) {    // DC
  Put16(DC >> 5, dst);
 }

-static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+static void DC16NoTop_C(uint8_t* dst) {   // DC with top samples not available
  int DC = 8;
  int j;
  for (j = 0; j < 16; ++j) {
@ -221,7 +230,7 @@ static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
  Put16(DC >> 4, dst);
 }

-static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+static void DC16NoLeft_C(uint8_t* dst) {  // DC with left samples not available
  int DC = 8;
  int i;
  for (i = 0; i < 16; ++i) {
@ -230,9 +239,10 @@ static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
  Put16(DC >> 4, dst);
 }

-static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
+static void DC16NoTopLeft_C(uint8_t* dst) {  // DC with no top and left samples
  Put16(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];

@ -242,7 +252,8 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

-static void VE4(uint8_t* dst) {    // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE4_C(uint8_t* dst) {    // vertical
  const uint8_t* top = dst - BPS;
  const uint8_t vals[4] = {
    AVG3(top[-1], top[0], top[1]),
@ -255,8 +266,9 @@ static void VE4(uint8_t* dst) {    // vertical
    memcpy(dst + i * BPS, vals, sizeof(vals));
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void HE4(uint8_t* dst) {    // horizontal
+static void HE4_C(uint8_t* dst) {    // horizontal
  const int A = dst[-1 - BPS];
  const int B = dst[-1];
  const int C = dst[-1 + BPS];
@ -268,7 +280,8 @@ static void HE4(uint8_t* dst) {    // horizontal
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
 }

-static void DC4(uint8_t* dst) {   // DC
+#if !WEBP_NEON_OMIT_C_CODE
+static void DC4_C(uint8_t* dst) {   // DC
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@ -276,7 +289,7 @@ static void DC4(uint8_t* dst) {   // DC
  for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
 }

-static void RD4(uint8_t* dst) {   // Down-right
+static void RD4_C(uint8_t* dst) {   // Down-right
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -295,7 +308,7 @@ static void RD4(uint8_t* dst) {   // Down-right
                                      DST(3, 0) = AVG3(D, C, B);
 }

-static void LD4(uint8_t* dst) {   // Down-Left
+static void LD4_C(uint8_t* dst) {   // Down-Left
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
@ -312,8 +325,9 @@ static void LD4(uint8_t* dst) {   // Down-Left
                          DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
                                      DST(3, 3) = AVG3(G, H, H);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void VR4(uint8_t* dst) {   // Vertical-Right
+static void VR4_C(uint8_t* dst) {   // Vertical-Right
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -335,7 +349,7 @@ static void VR4(uint8_t* dst) {   // Vertical-Right
  DST(3, 1) =             AVG3(B, C, D);
 }

-static void VL4(uint8_t* dst) {   // Vertical-Left
+static void VL4_C(uint8_t* dst) {   // Vertical-Left
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
@ -357,7 +371,7 @@ static void VL4(uint8_t* dst) {   // Vertical-Left
              DST(3, 3) = AVG3(F, G, H);
 }

-static void HU4(uint8_t* dst) {   // Horizontal-Up
+static void HU4_C(uint8_t* dst) {   // Horizontal-Up
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -372,7 +386,7 @@ static void HU4(uint8_t* dst) {   // Horizontal-Up
    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static void HD4(uint8_t* dst) {  // Horizontal-Down
+static void HD4_C(uint8_t* dst) {  // Horizontal-Down
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@ -404,14 +418,15 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
 //------------------------------------------------------------------------------
 // Chroma

-static void VE8uv(uint8_t* dst) {    // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE8uv_C(uint8_t* dst) {    // vertical
  int j;
  for (j = 0; j < 8; ++j) {
    memcpy(dst + j * BPS, dst - BPS, 8);
  }
 }

-static void HE8uv(uint8_t* dst) {    // horizontal
+static void HE8uv_C(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    memset(dst, dst[-1], 8);
@ -427,7 +442,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  }
 }

-static void DC8uv(uint8_t* dst) {     // DC
+static void DC8uv_C(uint8_t* dst) {     // DC
  int dc0 = 8;
  int i;
  for (i = 0; i < 8; ++i) {
@ -436,7 +451,7 @@ static void DC8uv(uint8_t* dst) {     // DC
  Put8x8uv(dc0 >> 4, dst);
 }

-static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+static void DC8uvNoLeft_C(uint8_t* dst) {   // DC with no left samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
@ -445,7 +460,7 @@ static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  Put8x8uv(dc0 >> 3, dst);
 }

-static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+static void DC8uvNoTop_C(uint8_t* dst) {  // DC with no top samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
@ -454,17 +469,19 @@ static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  Put8x8uv(dc0 >> 3, dst);
 }

-static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
+static void DC8uvNoTopLeft_C(uint8_t* dst) {    // DC with nothing
  Put8x8uv(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];

 //------------------------------------------------------------------------------
 // Edge filtering functions

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 // 4 pixels in, 2 pixels out
-static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
  const int a1 = VP8ksclip2[(a + 4) >> 3];            // in [-16,15]
@ -474,7 +491,7 @@ static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
 }

 // 4 pixels in, 4 pixels out
-static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0);
  const int a1 = VP8ksclip2[(a + 4) >> 3];
@ -487,7 +504,7 @@ static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
 }

 // 6 pixels in, 6 pixels out
-static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
  const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2*step];
  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
@ -503,18 +520,22 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
  p[ 2*step] = VP8kclip1[q2 - a3];
 }

-static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static WEBP_INLINE int needs_filter2(const uint8_t* p,
-                                     int step, int t, int it) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
+                                      int step, int t, int it) {
  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
  const int p0 = p[-step], q0 = p[0];
  const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
@ -523,140 +544,159 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
         VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
         VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)

-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i, stride, thresh2)) {
-      do_filter2(p + i, stride);
+    if (NeedsFilter_C(p + i, stride, thresh2)) {
+      DoFilter2_C(p + i, stride);
    }
  }
 }

-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i * stride, 1, thresh2)) {
-      do_filter2(p + i * stride, 1);
+    if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
+      DoFilter2_C(p + i * stride, 1);
    }
  }
 }

-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
+    SimpleVFilter16_C(p, stride, thresh);
  }
 }

-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
-    SimpleHFilter16(p, stride, thresh);
+    SimpleHFilter16_C(p, stride, thresh);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)

-static WEBP_INLINE void FilterLoop26(uint8_t* p,
-                                     int hstride, int vstride, int size,
-                                     int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
+                                       int hstride, int vstride, int size,
+                                       int thresh, int ithresh,
+                                       int hev_thresh) {
  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh2, ithresh)) {
-      if (hev(p, hstride, hev_thresh)) {
-        do_filter2(p, hstride);
+    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+      if (Hev(p, hstride, hev_thresh)) {
+        DoFilter2_C(p, hstride);
      } else {
-        do_filter6(p, hstride);
+        DoFilter6_C(p, hstride);
      }
    }
    p += vstride;
  }
 }

-static WEBP_INLINE void FilterLoop24(uint8_t* p,
-                                     int hstride, int vstride, int size,
-                                     int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
+                                       int hstride, int vstride, int size,
+                                       int thresh, int ithresh,
+                                       int hev_thresh) {
  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh2, ithresh)) {
-      if (hev(p, hstride, hev_thresh)) {
-        do_filter2(p, hstride);
+    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+      if (Hev(p, hstride, hev_thresh)) {
+        DoFilter2_C(p, hstride);
      } else {
-        do_filter4(p, hstride);
+        DoFilter4_C(p, hstride);
      }
    }
    p += vstride;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

+#if !WEBP_NEON_OMIT_C_CODE
 // on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+static void VFilter16_C(uint8_t* p, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
 }

-static void HFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+static void HFilter16_C(uint8_t* p, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
 }

 // on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_C(uint8_t* p, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
-    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter16i_C(uint8_t* p, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
-    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

+#if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE
+static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 //------------------------------------------------------------------------------

-static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
-                             int dst_stride) {
+static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
+                               int dst_stride) {
  int i, j;
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) {
@ -700,7 +740,6 @@ extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);
 extern void VP8DspInitMIPSdspR2(void);
 extern void VP8DspInitMSA(void);
-extern void VP8DspInitWASM(void);

 static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;
@ -710,54 +749,66 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {

  VP8InitClipTables();

-  VP8TransformWHT = TransformWHT;
-  VP8Transform = TransformTwo;
-  VP8TransformUV = TransformUV;
-  VP8TransformDC = TransformDC;
-  VP8TransformDCUV = TransformDCUV;
-  VP8TransformAC3 = TransformAC3;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8TransformWHT = TransformWHT_C;
+  VP8Transform = TransformTwo_C;
+  VP8TransformDC = TransformDC_C;
+  VP8TransformAC3 = TransformAC3_C;
+#endif
+  VP8TransformUV = TransformUV_C;
+  VP8TransformDCUV = TransformDCUV_C;

-  VP8VFilter16 = VFilter16;
-  VP8HFilter16 = HFilter16;
-  VP8VFilter8 = VFilter8;
-  VP8HFilter8 = HFilter8;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16i = HFilter16i;
-  VP8VFilter8i = VFilter8i;
-  VP8HFilter8i = HFilter8i;
-  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16i;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8VFilter16 = VFilter16_C;
+  VP8VFilter16i = VFilter16i_C;
+  VP8HFilter16 = HFilter16_C;
+  VP8VFilter8 = VFilter8_C;
+  VP8VFilter8i = VFilter8i_C;
+  VP8SimpleVFilter16 = SimpleVFilter16_C;
+  VP8SimpleHFilter16 = SimpleHFilter16_C;
+  VP8SimpleVFilter16i = SimpleVFilter16i_C;
+  VP8SimpleHFilter16i = SimpleHFilter16i_C;
+#endif

-  VP8PredLuma4[0] = DC4;
-  VP8PredLuma4[1] = TM4;
-  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[3] = HE4;
-  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[5] = VR4;
-  VP8PredLuma4[6] = LD4;
-  VP8PredLuma4[7] = VL4;
-  VP8PredLuma4[8] = HD4;
-  VP8PredLuma4[9] = HU4;
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8HFilter16i = HFilter16i_C;
+  VP8HFilter8 = HFilter8_C;
+  VP8HFilter8i = HFilter8i_C;
+#endif

-  VP8PredLuma16[0] = DC16;
-  VP8PredLuma16[1] = TM16;
-  VP8PredLuma16[2] = VE16;
-  VP8PredLuma16[3] = HE16;
-  VP8PredLuma16[4] = DC16NoTop;
-  VP8PredLuma16[5] = DC16NoLeft;
-  VP8PredLuma16[6] = DC16NoTopLeft;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8PredLuma4[0] = DC4_C;
+  VP8PredLuma4[1] = TM4_C;
+  VP8PredLuma4[2] = VE4_C;
+  VP8PredLuma4[4] = RD4_C;
+  VP8PredLuma4[6] = LD4_C;
+#endif

-  VP8PredChroma8[0] = DC8uv;
-  VP8PredChroma8[1] = TM8uv;
-  VP8PredChroma8[2] = VE8uv;
-  VP8PredChroma8[3] = HE8uv;
-  VP8PredChroma8[4] = DC8uvNoTop;
-  VP8PredChroma8[5] = DC8uvNoLeft;
-  VP8PredChroma8[6] = DC8uvNoTopLeft;
+  VP8PredLuma4[3] = HE4_C;
+  VP8PredLuma4[5] = VR4_C;
+  VP8PredLuma4[7] = VL4_C;
+  VP8PredLuma4[8] = HD4_C;
+  VP8PredLuma4[9] = HU4_C;

-  VP8DitherCombine8x8 = DitherCombine8x8;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8PredLuma16[0] = DC16_C;
+  VP8PredLuma16[1] = TM16_C;
+  VP8PredLuma16[2] = VE16_C;
+  VP8PredLuma16[3] = HE16_C;
+  VP8PredLuma16[4] = DC16NoTop_C;
+  VP8PredLuma16[5] = DC16NoLeft_C;
+  VP8PredLuma16[6] = DC16NoTopLeft_C;
+
+  VP8PredChroma8[0] = DC8uv_C;
+  VP8PredChroma8[1] = TM8uv_C;
+  VP8PredChroma8[2] = VE8uv_C;
+  VP8PredChroma8[3] = HE8uv_C;
+  VP8PredChroma8[4] = DC8uvNoTop_C;
+  VP8PredChroma8[5] = DC8uvNoLeft_C;
+  VP8PredChroma8[6] = DC8uvNoTopLeft_C;
+#endif
+
+  VP8DitherCombine8x8 = DitherCombine8x8_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@ -771,11 +822,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
 #endif
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8DspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8DspInitMIPS32();
@ -790,12 +836,59 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
    if (VP8GetCPUInfo(kMSA)) {
      VP8DspInitMSA();
    }
-#endif
-#if defined(WEBP_USE_WASM)
-    if (VP8GetCPUInfo(kWASM)) {
-      VP8DspInitWASM();
-    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8DspInitNEON();
+  }
+#endif
+
+  assert(VP8TransformWHT != NULL);
+  assert(VP8Transform != NULL);
+  assert(VP8TransformDC != NULL);
+  assert(VP8TransformAC3 != NULL);
+  assert(VP8TransformUV != NULL);
+  assert(VP8TransformDCUV != NULL);
+  assert(VP8VFilter16 != NULL);
+  assert(VP8HFilter16 != NULL);
+  assert(VP8VFilter8 != NULL);
+  assert(VP8HFilter8 != NULL);
+  assert(VP8VFilter16i != NULL);
+  assert(VP8HFilter16i != NULL);
+  assert(VP8VFilter8i != NULL);
+  assert(VP8HFilter8i != NULL);
+  assert(VP8SimpleVFilter16 != NULL);
+  assert(VP8SimpleHFilter16 != NULL);
+  assert(VP8SimpleVFilter16i != NULL);
+  assert(VP8SimpleHFilter16i != NULL);
+  assert(VP8PredLuma4[0] != NULL);
+  assert(VP8PredLuma4[1] != NULL);
+  assert(VP8PredLuma4[2] != NULL);
+  assert(VP8PredLuma4[3] != NULL);
+  assert(VP8PredLuma4[4] != NULL);
+  assert(VP8PredLuma4[5] != NULL);
+  assert(VP8PredLuma4[6] != NULL);
+  assert(VP8PredLuma4[7] != NULL);
+  assert(VP8PredLuma4[8] != NULL);
+  assert(VP8PredLuma4[9] != NULL);
+  assert(VP8PredLuma16[0] != NULL);
+  assert(VP8PredLuma16[1] != NULL);
+  assert(VP8PredLuma16[2] != NULL);
+  assert(VP8PredLuma16[3] != NULL);
+  assert(VP8PredLuma16[4] != NULL);
+  assert(VP8PredLuma16[5] != NULL);
+  assert(VP8PredLuma16[6] != NULL);
+  assert(VP8PredChroma8[0] != NULL);
+  assert(VP8PredChroma8[1] != NULL);
+  assert(VP8PredChroma8[2] != NULL);
+  assert(VP8PredChroma8[3] != NULL);
+  assert(VP8PredChroma8[4] != NULL);
+  assert(VP8PredChroma8[5] != NULL);
+  assert(VP8PredChroma8[6] != NULL);
+  assert(VP8DitherCombine8x8 != NULL);
+
  dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/dec_clip_tables.c
+++ b/src/dsp/dec_clip_tables.c
@ -11,11 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

-#define USE_STATIC_TABLES     // undefine to have run-time table initialization
+// define to 0 to have run-time table initialization
+#if !defined(USE_STATIC_TABLES)
+#define USE_STATIC_TABLES 1   // ALTERNATE_CODE
+#endif

-#ifdef USE_STATIC_TABLES
+#if (USE_STATIC_TABLES == 1)

 static const uint8_t abs0[255 + 255 + 1] = {
  0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
@ -337,7 +340,7 @@ static uint8_t clip1[255 + 511 + 1];
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;

-#endif
+#endif    // USE_STATIC_TABLES

 const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
 const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
@ -345,7 +348,7 @@ const uint8_t* const VP8kclip1 = &clip1[255];
 const uint8_t* const VP8kabs0 = &abs0[255];

 WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
-#if !defined(USE_STATIC_TABLES)
+#if (USE_STATIC_TABLES == 0)
  int i;
  if (!tables_ok) {
    for (i = -255; i <= 255; ++i) {
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MIPS32)

-#include "./mips_macro.h"
+#include "src/dsp/mips_macro.h"

 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-#include "./mips_macro.h"
+#include "src/dsp/mips_macro.h"

 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
--- a/src/dsp/dec_msa.c
+++ b/src/dsp/dec_msa.c
@ -12,11 +12,11 @@
 // Author(s):  Prashant Patil   (prashant.patil@imgtec.com)


-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MSA)

-#include "./msa_macro.h"
+#include "src/dsp/msa_macro.h"

 //------------------------------------------------------------------------------
 // Transforms
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -12,23 +12,25 @@
 // Author: somnath@google.com (Somnath Banerjee)
 //         cduvivier@google.com (Christian Duvivier)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_SSE2)

 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
 // one it seems => disabled by default. Define USE_TRANSFORM_AC3=1 to enable:
-// #define USE_TRANSFORM_AC3
+#if !defined(USE_TRANSFORM_AC3)
+#define USE_TRANSFORM_AC3 0   // ALTERNATE_CODE
+#endif

 #include <emmintrin.h>
-#include "./common_sse2.h"
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"

 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

-static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -193,7 +195,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
  }
 }

-#if defined(USE_TRANSFORM_AC3)
+#if (USE_TRANSFORM_AC3 == 1)
 #define MUL(a, b) (((a) * (b)) >> 16)
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
@ -248,7 +250,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
    _mm_subs_epu8((p), (q)))

 // Shift each byte of "x" right by 3 bits while preserving the sign bit.
-static WEBP_INLINE void SignedShift8b(__m128i* const x) {
+static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
  const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
@ -258,8 +260,8 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
 }

 #define FLIP_SIGN_BIT2(a, b) {                                                 \
-  a = _mm_xor_si128(a, sign_bit);                                              \
-  b = _mm_xor_si128(b, sign_bit);                                              \
+  (a) = _mm_xor_si128(a, sign_bit);                                            \
+  (b) = _mm_xor_si128(b, sign_bit);                                            \
 }

 #define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
@ -268,11 +270,11 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
 }

 // input/output is uint8_t
-static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
-                                  const __m128i* const p0,
-                                  const __m128i* const q0,
-                                  const __m128i* const q1,
-                                  int hev_thresh, __m128i* const not_hev) {
+static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
+                                       const __m128i* const p0,
+                                       const __m128i* const q0,
+                                       const __m128i* const q1,
+                                       int hev_thresh, __m128i* const not_hev) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t_1 = MM_ABS(*p1, *p0);
  const __m128i t_2 = MM_ABS(*q1, *q0);
@ -285,11 +287,11 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
 }

 // input pixels are int8_t
-static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
-                                     const __m128i* const p0,
-                                     const __m128i* const q0,
-                                     const __m128i* const q1,
-                                     __m128i* const delta) {
+static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
+                                          const __m128i* const p0,
+                                          const __m128i* const q0,
+                                          const __m128i* const q1,
+                                          __m128i* const delta) {
  // beware of addition order, for saturation!
  const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1);   // p1 - q1
  const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0);   // q0 - p0
@ -300,15 +302,16 @@ static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
 }

 // input and output are int8_t
-static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
-                                       const __m128i* const fl) {
+static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
+                                            __m128i* const q0,
+                                            const __m128i* const fl) {
  const __m128i k3 = _mm_set1_epi8(3);
  const __m128i k4 = _mm_set1_epi8(4);
  __m128i v3 = _mm_adds_epi8(*fl, k3);
  __m128i v4 = _mm_adds_epi8(*fl, k4);

-  SignedShift8b(&v4);                  // v4 >> 3
-  SignedShift8b(&v3);                  // v3 >> 3
+  SignedShift8b_SSE2(&v4);             // v4 >> 3
+  SignedShift8b_SSE2(&v3);             // v3 >> 3
  *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
  *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
 }
@ -317,9 +320,9 @@ static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
 // Update operations:
 // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
 // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
-static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
-                                      const __m128i* const a0_lo,
-                                      const __m128i* const a0_hi) {
+static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
+                                           const __m128i* const a0_lo,
+                                           const __m128i* const a0_hi) {
  const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
  const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
  const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
@ -330,11 +333,11 @@ static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
 }

 // input pixels are uint8_t
-static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
-                                    const __m128i* const p0,
-                                    const __m128i* const q0,
-                                    const __m128i* const q1,
-                                    int thresh, __m128i* const mask) {
+static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
+                                         const __m128i* const p0,
+                                         const __m128i* const q0,
+                                         const __m128i* const q1,
+                                         int thresh, __m128i* const mask) {
  const __m128i m_thresh = _mm_set1_epi8(thresh);
  const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
  const __m128i kFE = _mm_set1_epi8(0xFE);
@ -353,28 +356,29 @@ static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
 // Edge filtering functions

 // Applies filter on 2 pixels (p0 and q0)
-static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
-                                  __m128i* const q0, __m128i* const q1,
-                                  int thresh) {
+static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
+                                       __m128i* const q0, __m128i* const q1,
+                                       int thresh) {
  __m128i a, mask;
  const __m128i sign_bit = _mm_set1_epi8(0x80);
-  // convert p1/q1 to int8_t (for GetBaseDelta)
+  // convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
  const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
  const __m128i q1s = _mm_xor_si128(*q1, sign_bit);

-  NeedsFilter(p1, p0, q0, q1, thresh, &mask);
+  NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask);

  FLIP_SIGN_BIT2(*p0, *q0);
-  GetBaseDelta(&p1s, p0, q0, &q1s, &a);
+  GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a);
  a = _mm_and_si128(a, mask);     // mask filter values we don't care about
-  DoSimpleFilter(p0, q0, &a);
+  DoSimpleFilter_SSE2(p0, q0, &a);
  FLIP_SIGN_BIT2(*p0, *q0);
 }

 // Applies filter on 4 pixels (p1, p0, q0 and q1)
-static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
-                                  __m128i* const q0, __m128i* const q1,
-                                  const __m128i* const mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
+                                       __m128i* const q0, __m128i* const q1,
+                                       const __m128i* const mask,
+                                       int hev_thresh) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i k64 = _mm_set1_epi8(64);
@ -384,7 +388,7 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
  __m128i t1, t2, t3;

  // compute hev mask
-  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+  GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);

  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@ -399,8 +403,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,

  t2 = _mm_adds_epi8(t1, k3);        // 3 * (q0 - p0) + hev(p1 - q1) + 3
  t3 = _mm_adds_epi8(t1, k4);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
-  SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
-  SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+  SignedShift8b_SSE2(&t2);           // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+  SignedShift8b_SSE2(&t3);           // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
  *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
  FLIP_SIGN_BIT2(*p0, *q0);
@ -417,25 +421,26 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
 }

 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
-                                  __m128i* const p0, __m128i* const q0,
-                                  __m128i* const q1, __m128i* const q2,
-                                  const __m128i* const mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
+                                       __m128i* const p0, __m128i* const q0,
+                                       __m128i* const q1, __m128i* const q2,
+                                       const __m128i* const mask,
+                                       int hev_thresh) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  __m128i a, not_hev;

  // compute hev mask
-  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+  GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);

  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
-  GetBaseDelta(p1, p0, q0, q1, &a);
+  GetBaseDelta_SSE2(p1, p0, q0, q1, &a);

  { // do simple filter on pixels with hev
    const __m128i m = _mm_andnot_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
-    DoSimpleFilter(p0, q0, &f);
+    DoSimpleFilter_SSE2(p0, q0, &f);
  }

  { // do strong filter on pixels with not hev
@ -460,15 +465,15 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
    const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo);  // Filter * 27 + 63
    const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi);  // Filter * 27 + 63

-    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
-    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
-    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
+    Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi);
+    Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi);
+    Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi);
  }
 }

 // reads 8 rows across a vertical edge.
-static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
-                                __m128i* const p, __m128i* const q) {
+static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
+                                     __m128i* const p, __m128i* const q) {
  // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
  // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
  const __m128i A0 = _mm_set_epi32(
@ -494,11 +499,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
  *q = _mm_unpackhi_epi32(C0, C1);
 }

-static WEBP_INLINE void Load16x4(const uint8_t* const r0,
-                                 const uint8_t* const r8,
-                                 int stride,
-                                 __m128i* const p1, __m128i* const p0,
-                                 __m128i* const q0, __m128i* const q1) {
+static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
+                                      const uint8_t* const r8,
+                                      int stride,
+                                      __m128i* const p1, __m128i* const p0,
+                                      __m128i* const q0, __m128i* const q1) {
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
  //                10 11 | 12 13
@ -514,8 +519,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-  Load8x4(r0, stride, p1, q0);
-  Load8x4(r8, stride, p0, q1);
+  Load8x4_SSE2(r0, stride, p1, q0);
+  Load8x4_SSE2(r8, stride, p0, q1);

  {
    // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
@ -531,7 +536,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
  }
 }

-static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
+                                      uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
@ -540,12 +546,12 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
 }

 // Transpose back and store
-static WEBP_INLINE void Store16x4(const __m128i* const p1,
-                                  const __m128i* const p0,
-                                  const __m128i* const q0,
-                                  const __m128i* const q1,
-                                  uint8_t* r0, uint8_t* r8,
-                                  int stride) {
+static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
+                                       const __m128i* const p0,
+                                       const __m128i* const q0,
+                                       const __m128i* const q1,
+                                       uint8_t* r0, uint8_t* r8,
+                                       int stride) {
  __m128i t1, p1_s, p0_s, q0_s, q1_s;

  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
@ -572,55 +578,55 @@ static WEBP_INLINE void Store16x4(const __m128i* const p1,
  p1_s = _mm_unpacklo_epi16(t1, q1_s);
  q1_s = _mm_unpackhi_epi16(t1, q1_s);

-  Store4x4(&p0_s, r0, stride);
+  Store4x4_SSE2(&p0_s, r0, stride);
  r0 += 4 * stride;
-  Store4x4(&q0_s, r0, stride);
+  Store4x4_SSE2(&q0_s, r0, stride);

-  Store4x4(&p1_s, r8, stride);
+  Store4x4_SSE2(&p1_s, r8, stride);
  r8 += 4 * stride;
-  Store4x4(&q1_s, r8, stride);
+  Store4x4_SSE2(&q1_s, r8, stride);
 }

 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)

-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) {
  // Load
  __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
  __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
  __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
  __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);

-  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+  DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);

  // Store
  _mm_storeu_si128((__m128i*)&p[-stride], p0);
  _mm_storeu_si128((__m128i*)&p[0], q0);
 }

-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) {
  __m128i p1, p0, q0, q1;

  p -= 2;  // beginning of p1

-  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
-  DoFilter2(&p1, &p0, &q0, &q1, thresh);
-  Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
+  Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+  DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
+  Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
 }

-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
+    SimpleVFilter16_SSE2(p, stride, thresh);
  }
 }

-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
-    SimpleHFilter16(p, stride, thresh);
+    SimpleHFilter16_SSE2(p, stride, thresh);
  }
 }

@ -628,60 +634,60 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 // Complex In-loop filtering (Paragraph 15.3)

 #define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
-  m = MM_ABS(p1, p0);                                                          \
-  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  (m) = MM_ABS(p1, p0);                                                        \
+  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)

 #define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
-  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  (m) = _mm_max_epu8(m, MM_ABS(p1, p0));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)

 #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
-  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
-  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]);                            \
-  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]);                            \
-  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
+  (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]);                        \
+  (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]);                        \
+  (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]);                        \
+  (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]);                        \
 }

 #define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
  const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
  const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);                 \
-  p = _mm_unpacklo_epi64(U, V);                                                \
+  (p) = _mm_unpacklo_epi64(U, V);                                              \
 } while (0)

 #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
-  LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
-  LOADUV_H_EDGE(e2, u, v, 1 * stride);                                         \
-  LOADUV_H_EDGE(e3, u, v, 2 * stride);                                         \
-  LOADUV_H_EDGE(e4, u, v, 3 * stride);                                         \
+  LOADUV_H_EDGE(e1, u, v, 0 * (stride));                                       \
+  LOADUV_H_EDGE(e2, u, v, 1 * (stride));                                       \
+  LOADUV_H_EDGE(e3, u, v, 2 * (stride));                                       \
+  LOADUV_H_EDGE(e4, u, v, 3 * (stride));                                       \
 }

 #define STOREUV(p, u, v, stride) {                                             \
-  _mm_storel_epi64((__m128i*)&u[(stride)], p);                                 \
-  p = _mm_srli_si128(p, 8);                                                    \
-  _mm_storel_epi64((__m128i*)&v[(stride)], p);                                 \
+  _mm_storel_epi64((__m128i*)&(u)[(stride)], p);                               \
+  (p) = _mm_srli_si128(p, 8);                                                  \
+  _mm_storel_epi64((__m128i*)&(v)[(stride)], p);                               \
 }

-static WEBP_INLINE void ComplexMask(const __m128i* const p1,
-                                    const __m128i* const p0,
-                                    const __m128i* const q0,
-                                    const __m128i* const q1,
-                                    int thresh, int ithresh,
-                                    __m128i* const mask) {
+static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
+                                         const __m128i* const p0,
+                                         const __m128i* const q0,
+                                         const __m128i* const q1,
+                                         int thresh, int ithresh,
+                                         __m128i* const mask) {
  const __m128i it = _mm_set1_epi8(ithresh);
  const __m128i diff = _mm_subs_epu8(*mask, it);
  const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
  __m128i filter_mask;
-  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
+  NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask);
  *mask = _mm_and_si128(thresh_mask, filter_mask);
 }

 // on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter16_SSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
  __m128i t1;
  __m128i mask;
  __m128i p2, p1, p0, q0, q1, q2;
@ -694,8 +700,8 @@ static void VFilter16(uint8_t* p, int stride,
  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);

-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store
  _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
@ -706,28 +712,28 @@ static void VFilter16(uint8_t* p, int stride,
  _mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
 }

-static void HFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter16_SSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const b = p - 4;
-  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

-  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
+  Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

-  Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
-  Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
+  Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
+  Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
 }

 // on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_SSE2(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i p3, p2, p1, p0;   // loop invariants

@ -744,8 +750,8 @@ static void VFilter16i(uint8_t* p, int stride,

    // p3 and p2 are not just temporary variables here: they will be
    // re-used for next span. And q2/q3 will become p1/p0 accordingly.
-    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+    ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);

    // Store
    _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
@ -759,12 +765,12 @@ static void VFilter16i(uint8_t* p, int stride,
  }
 }

-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i_SSE2(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i p3, p2, p1, p0;   // loop invariants

-  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
+  Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue

  for (k = 3; k > 0; --k) {
    __m128i mask, tmp1, tmp2;
@ -773,13 +779,13 @@ static void HFilter16i(uint8_t* p, int stride,
    p += 4;  // beginning of q0 (and next span)

    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
-    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+    Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);

-    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+    ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);

-    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
+    Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);

    // rotate samples
    p1 = tmp1;
@ -788,8 +794,8 @@ static void HFilter16i(uint8_t* p, int stride,
 }

 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, p2, p1, p0, q0, q1, q2;

@ -801,8 +807,8 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);

-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store
  STOREUV(p2, u, v, -3 * stride);
@ -813,28 +819,28 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q2, u, v, 2 * stride);
 }

-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const tu = u - 4;
  uint8_t* const tv = v - 4;
-  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

-  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
+  Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

-  Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
-  Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
+  Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride);
+  Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
 }

-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;

@ -849,8 +855,8 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
  LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
  MAX_DIFF2(t2, t1, q1, q0, mask);

-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // Store
  STOREUV(p1, u, v, -2 * stride);
@ -859,24 +865,24 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q1, u, v, 1 * stride);
 }

-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
-  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
+  Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
  MAX_DIFF1(t2, t1, p1, p0, mask);

  u += 4;  // beginning of q0
  v += 4;
-  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+  Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
  MAX_DIFF2(t2, t1, q1, q0, mask);

-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  u -= 2;  // beginning of p1
  v -= 2;
-  Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
+  Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride);
 }

 //------------------------------------------------------------------------------
@ -893,7 +899,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1

-static void VE4(uint8_t* dst) {    // vertical
+static void VE4_SSE2(uint8_t* dst) {    // vertical
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -909,7 +915,7 @@ static void VE4(uint8_t* dst) {    // vertical
  }
 }

-static void LD4(uint8_t* dst) {   // Down-Left
+static void LD4_SSE2(uint8_t* dst) {   // Down-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -925,7 +931,7 @@ static void LD4(uint8_t* dst) {   // Down-Left
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }

-static void VR4(uint8_t* dst) {   // Vertical-Right
+static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
  const __m128i one = _mm_set1_epi8(1);
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
@ -950,7 +956,7 @@ static void VR4(uint8_t* dst) {   // Vertical-Right
  DST(0, 3) = AVG3(K, J, I);
 }

-static void VL4(uint8_t* dst) {   // Vertical-Left
+static void VL4_SSE2(uint8_t* dst) {   // Vertical-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@ -975,7 +981,7 @@ static void VL4(uint8_t* dst) {   // Vertical-Left
  DST(3, 3) = (extra_out >> 8) & 0xff;
 }

-static void RD4(uint8_t* dst) {   // Down-right
+static void RD4_SSE2(uint8_t* dst) {   // Down-right
  const __m128i one = _mm_set1_epi8(1);
  const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
  const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
@ -1004,7 +1010,7 @@ static void RD4(uint8_t* dst) {   // Down-right
 //------------------------------------------------------------------------------
 // Luma 16x16

-static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
  const uint8_t* top = dst - BPS;
  const __m128i zero = _mm_setzero_si128();
  int y;
@ -1041,11 +1047,11 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  }
 }

-static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
+static void TM4_SSE2(uint8_t* dst)   { TrueMotion_SSE2(dst, 4); }
+static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); }
+static void TM16_SSE2(uint8_t* dst)  { TrueMotion_SSE2(dst, 16); }

-static void VE16(uint8_t* dst) {
+static void VE16_SSE2(uint8_t* dst) {
  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
  int j;
  for (j = 0; j < 16; ++j) {
@ -1053,7 +1059,7 @@ static void VE16(uint8_t* dst) {
  }
 }

-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_SSE2(uint8_t* dst) {     // horizontal
  int j;
  for (j = 16; j > 0; --j) {
    const __m128i values = _mm_set1_epi8(dst[-1]);
@ -1062,7 +1068,7 @@ static void HE16(uint8_t* dst) {     // horizontal
  }
 }

-static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 16; ++j) {
@ -1070,7 +1076,7 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
  }
 }

-static void DC16(uint8_t* dst) {    // DC
+static void DC16_SSE2(uint8_t* dst) {  // DC
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
  const __m128i sad8x2 = _mm_sad_epu8(top, zero);
@ -1083,37 +1089,37 @@ static void DC16(uint8_t* dst) {    // DC
  }
  {
    const int DC = _mm_cvtsi128_si32(sum) + left + 16;
-    Put16(DC >> 5, dst);
+    Put16_SSE2(DC >> 5, dst);
  }
 }

-static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+static void DC16NoTop_SSE2(uint8_t* dst) {  // DC with top samples unavailable
  int DC = 8;
  int j;
  for (j = 0; j < 16; ++j) {
    DC += dst[-1 + j * BPS];
  }
-  Put16(DC >> 4, dst);
+  Put16_SSE2(DC >> 4, dst);
 }

-static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+static void DC16NoLeft_SSE2(uint8_t* dst) {  // DC with left samples unavailable
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
  const __m128i sad8x2 = _mm_sad_epu8(top, zero);
  // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
  const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
  const int DC = _mm_cvtsi128_si32(sum) + 8;
-  Put16(DC >> 4, dst);
+  Put16_SSE2(DC >> 4, dst);
 }

-static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
-  Put16(0x80, dst);
+static void DC16NoTopLeft_SSE2(uint8_t* dst) {  // DC with no top & left samples
+  Put16_SSE2(0x80, dst);
 }

 //------------------------------------------------------------------------------
 // Chroma

-static void VE8uv(uint8_t* dst) {    // vertical
+static void VE8uv_SSE2(uint8_t* dst) {    // vertical
  int j;
  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
  for (j = 0; j < 8; ++j) {
@ -1121,17 +1127,8 @@ static void VE8uv(uint8_t* dst) {    // vertical
  }
 }

-static void HE8uv(uint8_t* dst) {    // horizontal
-  int j;
-  for (j = 0; j < 8; ++j) {
-    const __m128i values = _mm_set1_epi8(dst[-1]);
-    _mm_storel_epi64((__m128i*)dst, values);
-    dst += BPS;
-  }
-}
-
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 8; ++j) {
@ -1139,7 +1136,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
  }
 }

-static void DC8uv(uint8_t* dst) {     // DC
+static void DC8uv_SSE2(uint8_t* dst) {     // DC
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
  const __m128i sum = _mm_sad_epu8(top, zero);
@ -1150,29 +1147,29 @@ static void DC8uv(uint8_t* dst) {     // DC
  }
  {
    const int DC = _mm_cvtsi128_si32(sum) + left + 8;
-    Put8x8uv(DC >> 4, dst);
+    Put8x8uv_SSE2(DC >> 4, dst);
  }
 }

-static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+static void DC8uvNoLeft_SSE2(uint8_t* dst) {   // DC with no left samples
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
  const __m128i sum = _mm_sad_epu8(top, zero);
  const int DC = _mm_cvtsi128_si32(sum) + 4;
-  Put8x8uv(DC >> 3, dst);
+  Put8x8uv_SSE2(DC >> 3, dst);
 }

-static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+static void DC8uvNoTop_SSE2(uint8_t* dst) {  // DC with no top samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
    dc0 += dst[-1 + i * BPS];
  }
-  Put8x8uv(dc0 >> 3, dst);
+  Put8x8uv_SSE2(dc0 >> 3, dst);
 }

-static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
-  Put8x8uv(0x80, dst);
+static void DC8uvNoTopLeft_SSE2(uint8_t* dst) {    // DC with nothing
+  Put8x8uv_SSE2(0x80, dst);
 }

 //------------------------------------------------------------------------------
@ -1181,47 +1178,46 @@ static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
 extern void VP8DspInitSSE2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
-  VP8Transform = Transform;
-#if defined(USE_TRANSFORM_AC3)
-  VP8TransformAC3 = TransformAC3;
+  VP8Transform = Transform_SSE2;
+#if (USE_TRANSFORM_AC3 == 1)
+  VP8TransformAC3 = TransformAC3_SSE2;
 #endif

-  VP8VFilter16 = VFilter16;
-  VP8HFilter16 = HFilter16;
-  VP8VFilter8 = VFilter8;
-  VP8HFilter8 = HFilter8;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16i = HFilter16i;
-  VP8VFilter8i = VFilter8i;
-  VP8HFilter8i = HFilter8i;
+  VP8VFilter16 = VFilter16_SSE2;
+  VP8HFilter16 = HFilter16_SSE2;
+  VP8VFilter8 = VFilter8_SSE2;
+  VP8HFilter8 = HFilter8_SSE2;
+  VP8VFilter16i = VFilter16i_SSE2;
+  VP8HFilter16i = HFilter16i_SSE2;
+  VP8VFilter8i = VFilter8i_SSE2;
+  VP8HFilter8i = HFilter8i_SSE2;

-  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16i;
+  VP8SimpleVFilter16 = SimpleVFilter16_SSE2;
+  VP8SimpleHFilter16 = SimpleHFilter16_SSE2;
+  VP8SimpleVFilter16i = SimpleVFilter16i_SSE2;
+  VP8SimpleHFilter16i = SimpleHFilter16i_SSE2;

-  VP8PredLuma4[1] = TM4;
-  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[5] = VR4;
-  VP8PredLuma4[6] = LD4;
-  VP8PredLuma4[7] = VL4;
+  VP8PredLuma4[1] = TM4_SSE2;
+  VP8PredLuma4[2] = VE4_SSE2;
+  VP8PredLuma4[4] = RD4_SSE2;
+  VP8PredLuma4[5] = VR4_SSE2;
+  VP8PredLuma4[6] = LD4_SSE2;
+  VP8PredLuma4[7] = VL4_SSE2;

-  VP8PredLuma16[0] = DC16;
-  VP8PredLuma16[1] = TM16;
-  VP8PredLuma16[2] = VE16;
-  VP8PredLuma16[3] = HE16;
-  VP8PredLuma16[4] = DC16NoTop;
-  VP8PredLuma16[5] = DC16NoLeft;
-  VP8PredLuma16[6] = DC16NoTopLeft;
+  VP8PredLuma16[0] = DC16_SSE2;
+  VP8PredLuma16[1] = TM16_SSE2;
+  VP8PredLuma16[2] = VE16_SSE2;
+  VP8PredLuma16[3] = HE16_SSE2;
+  VP8PredLuma16[4] = DC16NoTop_SSE2;
+  VP8PredLuma16[5] = DC16NoLeft_SSE2;
+  VP8PredLuma16[6] = DC16NoTopLeft_SSE2;

-  VP8PredChroma8[0] = DC8uv;
-  VP8PredChroma8[1] = TM8uv;
-  VP8PredChroma8[2] = VE8uv;
-  VP8PredChroma8[3] = HE8uv;
-  VP8PredChroma8[4] = DC8uvNoTop;
-  VP8PredChroma8[5] = DC8uvNoLeft;
-  VP8PredChroma8[6] = DC8uvNoTopLeft;
+  VP8PredChroma8[0] = DC8uv_SSE2;
+  VP8PredChroma8[1] = TM8uv_SSE2;
+  VP8PredChroma8[2] = VE8uv_SSE2;
+  VP8PredChroma8[4] = DC8uvNoTop_SSE2;
+  VP8PredChroma8[5] = DC8uvNoLeft_SSE2;
+  VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/dec_sse41.c
+++ b/src/dsp/dec_sse41.c
@ -11,15 +11,15 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_SSE41)

 #include <smmintrin.h>
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"

-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_SSE41(uint8_t* dst) {     // horizontal
  int j;
  const __m128i kShuffle3 = _mm_set1_epi8(3);
  for (j = 16; j > 0; --j) {
@ -36,7 +36,7 @@ static void HE16(uint8_t* dst) {     // horizontal
 extern void VP8DspInitSSE41(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
-  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[3] = HE16_SSE41;
 }

 #else  // !WEBP_USE_SSE41
--- a/src/dsp/dec_wasm.c
+++ b/src/dsp/dec_wasm.c
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -15,10 +15,10 @@
 #define WEBP_DSP_DSP_H_

 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif

-#include "../webp/types.h"
+#include "src/webp/types.h"

 #ifdef __cplusplus
 extern "C" {
@ -51,9 +51,8 @@ extern "C" {
 # define __has_builtin(x) 0
 #endif

-// For now, none of the optimizations below are available in emscripten.
-// WebAssembly overrides native optimizations.
-#if !(defined(EMSCRIPTEN) || defined(WEBP_USE_WASM))
+// for now, none of the optimizations below are available in emscripten
+#if !defined(EMSCRIPTEN)

 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
@ -105,7 +104,7 @@ extern "C" {
 #define WEBP_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WEBP_USE_MIPS32_R2
-#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
+#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
 #define WEBP_USE_MIPS_DSP_R2
 #endif
 #endif
@ -117,6 +116,22 @@ extern "C" {

 #endif  /* EMSCRIPTEN */

+#ifndef WEBP_DSP_OMIT_C_CODE
+#define WEBP_DSP_OMIT_C_CODE 1
+#endif
+
+#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
+#define WEBP_NEON_OMIT_C_CODE 1
+#else
+#define WEBP_NEON_OMIT_C_CODE 0
+#endif
+
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#define WEBP_NEON_WORK_AROUND_GCC 1
+#else
+#define WEBP_NEON_WORK_AROUND_GCC 0
+#endif
+
 // This macro prevents thread_sanitizer from reporting known concurrent writes.
 #define WEBP_TSAN_IGNORE_FUNCTION
 #if defined(__has_feature)
@ -146,6 +161,11 @@ extern "C" {
 #endif
 #endif

+// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
+#if !defined(WEBP_SWAP_16BIT_CSP)
+#define WEBP_SWAP_16BIT_CSP 0
+#endif
+
 typedef enum {
  kSSE2,
  kSSE3,
@ -156,12 +176,11 @@ typedef enum {
  kNEON,
  kMIPS32,
  kMIPSdspR2,
-  kMSA,
-  kWASM
+  kMSA
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
-WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;

 //------------------------------------------------------------------------------
 // Init stub generator
@ -289,6 +308,7 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
                                        int xo, int yo,  // center position
                                        int W, int H);   // plane dimension

+#if !defined(WEBP_REDUCE_SIZE)
 // This version is called with the guarantee that you can load 8 bytes and
 // 8 rows at offset src1 and src2
 typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
@ -296,10 +316,13 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,

 extern VP8SSIMGetFunc VP8SSIMGet;         // unclipped / unchecked
 extern VP8SSIMGetClippedFunc VP8SSIMGetClipped;   // with clipping
+#endif

+#if !defined(WEBP_DISABLE_STATS)
 typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
                                         const uint8_t* src2, int len);
 extern VP8AccumulateSSEFunc VP8AccumulateSSE;
+#endif

 // must be called before using any of the above directly
 void VP8SSIMDspInit(void);
@ -480,12 +503,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
 extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;

 // Plain-C implementation, as fall-back.
-extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
-                                         const uint8_t* src);
-extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
-                                         const uint8_t* src);
-extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
-extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
+extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
+                                          const uint8_t* src);
+extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
+                                          const uint8_t* src);
+extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
+extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);

 // Main entry calls:
 extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
@ -551,25 +574,22 @@ void WebPMultRows(uint8_t* ptr, int stride,
                  int width, int num_rows, int inverse);

 // Plain-C versions, used as fallback by some implementations.
-void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
-                  int width, int inverse);
-void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
+void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+                   int width, int inverse);
+void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
+
+// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
+extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                           int len, int step, uint32_t* out);
+
+// This function returns true if src[i] contains a value different from 0xff.
+extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
+// This function returns true if src[4*i] contains a value different from 0xff.
+extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);

 // To be called first before using the above.
 void WebPInitAlphaProcessing(void);

-// ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
-                           const uint8_t* g, const uint8_t* b, int len,
-                           uint32_t* out);
-
-// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                          int len, int step, uint32_t* out);
-
-// To be called first before using the above.
-void VP8EncDspARGBInit(void);
-
 //------------------------------------------------------------------------------
 // Filter functions

--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -14,16 +14,18 @@
 #include <assert.h>
 #include <stdlib.h>  // for abs()

-#include "./dsp.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/enc/vp8i_enc.h"

 static WEBP_INLINE uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int clip_max(int v, int max) {
  return (v > max) ? max : v;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
@ -56,9 +58,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
  histo->last_non_zero = last_non_zero;
 }

-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+                               int start_block, int end_block,
+                               VP8Histogram* const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
@ -76,6 +79,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
  }
  VP8SetHistogramData(distribution, histo);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // run-time tables (~4k)
@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

+#if !WEBP_NEON_OMIT_C_CODE
+
 #define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))

@ -140,15 +146,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  }
 }

-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                         int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@ -176,13 +182,16 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
+                          int16_t* out) {
  VP8FTransform(src, ref, out);
  VP8FTransform(src + 4, ref + 4, out + 16);
 }

-static void FTransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void FTransformWHT_C(const int16_t* in, int16_t* out) {
  // input is 12b signed
  int32_t tmp[16];
  int i;
@ -211,6 +220,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
    out[12 + i] = b3 >> 1;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 #undef MUL
 #undef STORE
@ -303,8 +313,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top) {
  // U block
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
@ -323,8 +333,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_C(uint8_t* dst,
+                           const uint8_t* left, const uint8_t* top) {
  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
  VerticalPred(I16VE16 + dst, top, 16);
  HorizontalPred(I16HE16 + dst, left, 16);
@ -507,7 +517,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {

 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@ -523,6 +533,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Metric

+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
                              int w, int h) {
  int count = 0;
@ -538,20 +549,21 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
  return count;
 }

-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 16, 16);
 }
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 16, 8);
 }
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 8, 8);
 }
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 4, 4);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    uint32_t avg = 0;
@ -571,6 +583,7 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.

+#if !WEBP_NEON_OMIT_C_CODE
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
@ -608,24 +621,25 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
  return sum;
 }

-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
  return abs(sum2 - sum1) >> 5;
 }

-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_C(a + x + y, b + x + y, w);
    }
  }
  return D;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Quantization
@ -636,8 +650,8 @@ static const uint8_t kZigzag[16] = {
 };

 // Simple quantization
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
+                           const VP8Matrix* const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
@ -662,13 +676,15 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return (last >= 0);
 }

-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
+                             const VP8Matrix* const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 //------------------------------------------------------------------------------
 // Block copy
@ -682,11 +698,11 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  }
 }

-static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 4, 4);
 }

-static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 16, 8);
 }

@ -734,26 +750,32 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  InitTables();

  // default C implementations
-  VP8CollectHistogram = CollectHistogram;
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransform2 = FTransform2;
-  VP8FTransformWHT = FTransformWHT;
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8Mean16x4 = Mean16x4;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlock;
-  VP8Copy4x4 = Copy4x4;
-  VP8Copy16x8 = Copy16x8;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8ITransform = ITransform_C;
+  VP8FTransform = FTransform_C;
+  VP8FTransformWHT = FTransformWHT_C;
+  VP8TDisto4x4 = Disto4x4_C;
+  VP8TDisto16x16 = Disto16x16_C;
+  VP8CollectHistogram = CollectHistogram_C;
+  VP8SSE16x16 = SSE16x16_C;
+  VP8SSE16x8 = SSE16x8_C;
+  VP8SSE8x8 = SSE8x8_C;
+  VP8SSE4x4 = SSE4x4_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8EncQuantizeBlock = QuantizeBlock_C;
+  VP8EncQuantize2Blocks = Quantize2Blocks_C;
+#endif
+
+  VP8FTransform2 = FTransform2_C;
+  VP8EncPredLuma4 = Intra4Preds_C;
+  VP8EncPredLuma16 = Intra16Preds_C;
+  VP8EncPredChroma8 = IntraChromaPreds_C;
+  VP8Mean16x4 = Mean16x4_C;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
+  VP8Copy4x4 = Copy4x4_C;
+  VP8Copy16x8 = Copy16x8_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@ -772,11 +794,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
      VP8EncDspInitAVX2();
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8EncDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
@ -793,5 +810,34 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8EncDspInitNEON();
+  }
+#endif
+
+  assert(VP8ITransform != NULL);
+  assert(VP8FTransform != NULL);
+  assert(VP8FTransformWHT != NULL);
+  assert(VP8TDisto4x4 != NULL);
+  assert(VP8TDisto16x16 != NULL);
+  assert(VP8CollectHistogram != NULL);
+  assert(VP8SSE16x16 != NULL);
+  assert(VP8SSE16x8 != NULL);
+  assert(VP8SSE8x8 != NULL);
+  assert(VP8SSE4x4 != NULL);
+  assert(VP8EncQuantizeBlock != NULL);
+  assert(VP8EncQuantize2Blocks != NULL);
+  assert(VP8FTransform2 != NULL);
+  assert(VP8EncPredLuma4 != NULL);
+  assert(VP8EncPredLuma16 != NULL);
+  assert(VP8EncPredChroma8 != NULL);
+  assert(VP8Mean16x4 != NULL);
+  assert(VP8EncQuantizeBlockWHT != NULL);
+  assert(VP8Copy4x4 != NULL);
+  assert(VP8Copy16x8 != NULL);
+
  enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/enc_avx2.c
+++ b/src/dsp/enc_avx2.c
@ -9,7 +9,7 @@
 //
 // AVX2 version of speed-critical encoding functions.

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_AVX2)

--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@ -13,13 +13,13 @@
 //            Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 //            Slobodan Prijic  (slobodan.prijic@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MIPS32)

-#include "./mips_macro.h"
-#include "../enc/vp8i_enc.h"
-#include "../enc/cost_enc.h"
+#include "src/dsp/mips_macro.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/cost_enc.h"

 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
@ -113,8 +113,9 @@ static const int kC2 = 35468;
  "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"

 // Does one or two inverse transforms.
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                      uint8_t* dst) {
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
+                                             const int16_t* in,
+                                             uint8_t* dst) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@ -144,11 +145,11 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  );
 }

-static void ITransform(const uint8_t* ref, const int16_t* in,
-                       uint8_t* dst, int do_two) {
-  ITransformOne(ref, in, dst);
+static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
+                              uint8_t* dst, int do_two) {
+  ITransformOne_MIPS32(ref, in, dst);
  if (do_two) {
-    ITransformOne(ref + 4, in + 16, dst + 4);
+    ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
  }
 }

@ -187,8 +188,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
  "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
  "sh           %[level],       " #N "(%[pout])                     \n\t"

-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
+                                const VP8Matrix* const mtx) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int sign, coeff, level, i;
  int max_level = MAX_LEVEL;
@ -238,11 +239,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return 0;
 }

-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
+                                  const VP8Matrix* const mtx) {
  int nz;
-  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
 }

@ -361,8 +362,8 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
  "msub   %[temp6],  %[temp0]                \n\t"                \
  "msub   %[temp7],  %[temp1]                \n\t"

-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
  int tmp[32];
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

@ -396,13 +397,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS

-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
+                             const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
    }
  }
  return D;
@ -478,7 +479,8 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"

-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
+                              int16_t* out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
  int temp17, temp18, temp19, temp20;
@ -539,7 +541,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)

-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@ -573,7 +575,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@ -599,7 +601,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@ -621,7 +623,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@ -651,17 +653,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 extern void VP8EncDspInitMIPS32(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
+  VP8ITransform = ITransform_MIPS32;
+  VP8FTransform = FTransform_MIPS32;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
+
+  VP8TDisto4x4 = Disto4x4_MIPS32;
+  VP8TDisto16x16 = Disto16x16_MIPS32;
+
 #if !defined(WORK_AROUND_GCC)
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
+  VP8SSE16x16 = SSE16x16_MIPS32;
+  VP8SSE8x8 = SSE8x8_MIPS32;
+  VP8SSE16x8 = SSE16x8_MIPS32;
+  VP8SSE4x4 = SSE4x4_MIPS32;
 #endif
 }

--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@ -12,13 +12,13 @@
 // Author(s): Darko Laus (darko.laus@imgtec.com)
 //            Mirko Raus (mirko.raus@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-#include "./mips_macro.h"
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/mips_macro.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"

 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
@ -141,7 +141,8 @@ static const int kC2 = 35468;
  "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"

-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
+                                 int16_t* out) {
  const int c2217 = 2217;
  const int c5352 = 5352;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -238,16 +239,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  );
 }

-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
+                                 uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
+                              const uint16_t* const w) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;

@ -313,13 +314,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
  return abs(temp3 - temp17) >> 5;
 }

-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_MIPSdspR2(const uint8_t* const a,
+                                const uint8_t* const b,
+                                const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
    }
  }
  return D;
@ -1011,8 +1013,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
+                                       const uint8_t* top) {
  // U block
  DCMode8(C8DC8 + dst, left, top);
  VerticalPred8(C8VE8 + dst, top);
@ -1031,8 +1033,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MIPSdspR2(uint8_t* dst,
+                                   const uint8_t* left, const uint8_t* top) {
  DCMode16(I16DC16 + dst, left, top);
  VerticalPred16(I16VE16 + dst, top);
  HorizontalPred16(I16HE16 + dst, left);
@ -1041,7 +1043,7 @@ static void Intra16Preds(uint8_t* dst,

 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@ -1077,7 +1079,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
  GET_SSE_INNER(C)                        \
  GET_SSE_INNER(D)

-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1107,7 +1109,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1129,7 +1131,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1147,7 +1149,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1270,8 +1272,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  "usw         $0,           " #J "(%[ppin])                 \n\t"        \
 "3:                                                          \n\t"

-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
+                                   const VP8Matrix* const mtx) {
  int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
  int sign, coeff, level;
  int max_level = MAX_LEVEL;
@ -1311,11 +1313,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return (ret != 0);
 }

-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
+                                     const VP8Matrix* const mtx) {
  int nz;
-  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
 }

@ -1358,7 +1360,7 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
  "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
  "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"

-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;

@ -1450,9 +1452,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp3])                   \n\t"

-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
+                                       int start_block, int end_block,
+                                       VP8Histogram* const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
@ -1484,23 +1486,28 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 extern void VP8EncDspInitMIPSdspR2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
-  VP8FTransform = FTransform;
-  VP8ITransform = ITransform;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8EncPredLuma4 = Intra4Preds;
+  VP8FTransform = FTransform_MIPSdspR2;
+  VP8FTransformWHT = FTransformWHT_MIPSdspR2;
+  VP8ITransform = ITransform_MIPSdspR2;
+
+  VP8TDisto4x4 = Disto4x4_MIPSdspR2;
+  VP8TDisto16x16 = Disto16x16_MIPSdspR2;
+
+  VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
+  VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
+  VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
+
 #if !defined(WORK_AROUND_GCC)
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
+  VP8SSE16x16 = SSE16x16_MIPSdspR2;
+  VP8SSE8x8 = SSE8x8_MIPSdspR2;
+  VP8SSE16x8 = SSE16x8_MIPSdspR2;
+  VP8SSE4x4 = SSE4x4_MIPSdspR2;
 #endif
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8FTransformWHT = FTransformWHT;
-  VP8CollectHistogram = CollectHistogram;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
+
+  VP8CollectHistogram = CollectHistogram_MIPSdspR2;
 }

 #else  // !WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/enc_msa.c
+++ b/src/dsp/enc_msa.c
@ -11,13 +11,13 @@
 //
 // Author:  Prashant Patil   (prashant.patil@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MSA)

 #include <stdlib.h>
-#include "./msa_macro.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/msa_macro.h"
+#include "src/enc/vp8i_enc.h"

 //------------------------------------------------------------------------------
 // Transforms
@ -69,15 +69,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }

-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                           int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
+                           int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
@ -130,7 +131,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  SD4(out0, out1, out2, out3, out, 8);
 }

-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
  v8i16 in0 = { 0 };
  v8i16 in1 = { 0 };
  v8i16 tmp0, tmp1, tmp2, tmp3;
@ -167,7 +168,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
  ST_SH2(out0, out1, out, 8);
 }

-static int TTransform(const uint8_t* in, const uint16_t* w) {
+static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
  int sum;
  uint32_t in0_m, in1_m, in2_m, in3_m;
  v16i8 src0 = { 0 };
@ -199,20 +200,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
  return sum;
 }

-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int sum1 = TTransform(a, w);
-  const int sum2 = TTransform(b, w);
+static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
+  const int sum1 = TTransform_MSA(a, w);
+  const int sum2 = TTransform_MSA(b, w);
  return abs(sum2 - sum1) >> 5;
 }

-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_MSA(a + x + y, b + x + y, w);
    }
  }
  return D;
@ -221,9 +222,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 //------------------------------------------------------------------------------
 // Histogram

-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
+                                 int start_block, int end_block,
+                                 VP8Histogram* const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
@ -430,7 +431,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 #undef AVG3
 #undef AVG2

-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@ -547,8 +548,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
  STORE16x16(out, dst);
 }

-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MSA(uint8_t* dst,
+                             const uint8_t* left, const uint8_t* top) {
  DCMode16x16(I16DC16 + dst, left, top);
  VerticalPred16x16(I16VE16 + dst, top);
  HorizontalPred16x16(I16HE16 + dst, left);
@ -669,8 +670,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
  STORE8x8(out, dst);
 }

-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
+                                 const uint8_t* top) {
  // U block
  DCMode8x8(C8DC8 + dst, left, top);
  VerticalPred8x8(C8VE8 + dst, top);
@ -711,7 +712,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
 } while (0)

-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -738,7 +739,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  return sum;
 }

-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -757,7 +758,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  return sum;
 }

-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -777,7 +778,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  return sum;
 }

-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
  uint32_t sum = 0;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
@ -799,8 +800,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 //------------------------------------------------------------------------------
 // Quantization

-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
+                             const VP8Matrix* const mtx) {
  int sum;
  v8i16 in0, in1, sh0, sh1, out0, out1;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@ -852,8 +853,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return (sum > 0);
 }

-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
+                               const VP8Matrix* const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -866,26 +867,26 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitMSA(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransformWHT = FTransformWHT;
+  VP8ITransform = ITransform_MSA;
+  VP8FTransform = FTransform_MSA;
+  VP8FTransformWHT = FTransformWHT_MSA;

-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8CollectHistogram = CollectHistogram;
+  VP8TDisto4x4 = Disto4x4_MSA;
+  VP8TDisto16x16 = Disto16x16_MSA;
+  VP8CollectHistogram = CollectHistogram_MSA;

-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
+  VP8EncPredLuma4 = Intra4Preds_MSA;
+  VP8EncPredLuma16 = Intra16Preds_MSA;
+  VP8EncPredChroma8 = IntraChromaPreds_MSA;

-  VP8SSE16x16 = SSE16x16;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE4x4 = SSE4x4;
+  VP8SSE16x16 = SSE16x16_MSA;
+  VP8SSE16x8 = SSE16x8_MSA;
+  VP8SSE8x8 = SSE8x8_MSA;
+  VP8SSE4x4 = SSE4x4_MSA;

-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlock;
+  VP8EncQuantizeBlock = QuantizeBlock_MSA;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
 }

 #else  // !WEBP_USE_MSA
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -11,14 +11,14 @@
 //
 // adapted from libvpx (http://www.webmproject.org/code/)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_NEON)

 #include <assert.h>

-#include "./neon.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/neon.h"
+#include "src/enc/vp8i_enc.h"

 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@ -37,15 +37,15 @@ static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
 #if defined(WEBP_USE_INTRINSICS)

 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
 }

 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
 // to the corresponding rows of 'dst'.
-static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
-                                            const int16x8_t dst01,
-                                            const int16x8_t dst23) {
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+                                                 const int16x8_t dst01,
+                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@ -57,8 +57,10 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
 }

-static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
-                               const uint8_t* const ref, uint8_t* const dst) {
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+                                    const int16x8_t row23,
+                                    const uint8_t* const ref,
+                                    uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

@ -70,19 +72,20 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,

  {
    // Convert to 16b.
-    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
-    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
+    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
-    SaturateAndStore4x4(dst, out01, out23);
+    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
 }

-static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
-                                     int16x8x2_t* const out) {
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+                                          const int16x8_t in1,
+                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
@ -90,7 +93,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
 }

-static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
@ -113,22 +116,22 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
-  Transpose8x2(E0, E1, rows);
+  Transpose8x2_NEON(E0, E1, rows);
 }

-static void ITransformOne(const uint8_t* ref,
-                          const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* ref,
+                               const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
-  TransformPass(&rows);
-  TransformPass(&rows);
-  Add4x4(rows.val[0], rows.val[1], ref, dst);
+  TransformPass_NEON(&rows);
+  TransformPass_NEON(&rows);
+  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
 }

 #else

-static void ITransformOne(const uint8_t* ref,
-                          const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* ref,
+                               const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

@ -243,16 +246,16 @@ static void ITransformOne(const uint8_t* ref,

 #endif    // WEBP_USE_INTRINSICS

-static void ITransform(const uint8_t* ref,
-                       const int16_t* in, uint8_t* dst, int do_two) {
-  ITransformOne(ref, in, dst);
+static void ITransform_NEON(const uint8_t* ref,
+                            const int16_t* in, uint8_t* dst, int do_two) {
+  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
-    ITransformOne(ref + 4, in + 16, dst + 4);
+    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
 }

 // Load all 4x4 pixels into a single uint8x16_t variable.
-static uint8x16_t Load4x4(const uint8_t* src) {
+static uint8x16_t Load4x4_NEON(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
@ -265,10 +268,12 @@ static uint8x16_t Load4x4(const uint8_t* src) {

 #if defined(WEBP_USE_INTRINSICS)

-static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
-                                         const int16x4_t C, const int16x4_t D,
-                                         int16x8_t* const out01,
-                                         int16x8_t* const out32) {
+static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
+                                              const int16x4_t B,
+                                              const int16x4_t C,
+                                              const int16x4_t D,
+                                              int16x8_t* const out01,
+                                              int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
@ -283,24 +288,24 @@ static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
                   vreinterpret_s64_s32(tmp02.val[1])));
 }

-static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
-                                         const uint8x8_t b) {
+static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
+                                              const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
 }

-static void FTransform(const uint8_t* src, const uint8_t* ref,
-                       int16_t* out) {
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
-    const uint8x16_t S0 = Load4x4(src);
-    const uint8x16_t R0 = Load4x4(ref);
-    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
-    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
+    const uint8x16_t S0 = Load4x4_NEON(src);
+    const uint8x16_t R0 = Load4x4_NEON(ref);
+    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
+    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
-    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
+    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // 1rst pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
@ -318,7 +323,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
-    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // 2nd pass
    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
@ -358,8 +363,8 @@ static const int32_t kCoeff32[] = {
  51000, 51000, 51000, 51000
 };

-static void FTransform(const uint8_t* src, const uint8_t* ref,
-                       int16_t* out) {
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
@ -478,7 +483,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
  src += stride;                                    \
 } while (0)

-static void FTransformWHT(const int16_t* src, int16_t* out) {
+static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
@ -516,7 +521,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
-    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
+    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
@ -560,7 +565,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
 // a 26ae, b 26ae
 // a 37bf, b 37bf
 //
-static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
@ -574,7 +579,8 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
  return q4_in;
 }

-static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
+    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
@ -593,7 +599,7 @@ static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
  return q4_out;
 }

-static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
@ -610,7 +616,7 @@ static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
  return q4_out;
 }

-static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
+static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
@ -622,8 +628,8 @@ static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
  return d4_w;
 }

-static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
-                                      const int16x4x4_t d4_w) {
+static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
+                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
@ -652,8 +658,8 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
+                         const uint16_t* const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@ -679,12 +685,12 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
-    const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
-    const int16x4x4_t d4_w = DistoLoadW(w);
+    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
+    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
-    const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
-    const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
-    int32x2_t d_sum = DistoSum(q4_h, d4_w);
+    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
+    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
+    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
@ -694,13 +700,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
 }
 #undef LOAD_LANE_32b

-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    }
  }
  return D;
@ -708,15 +714,15 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,

 //------------------------------------------------------------------------------

-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+                                  int start_block, int end_block,
+                                  VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
-    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
@ -740,9 +746,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,

 //------------------------------------------------------------------------------

-static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
-                                        const uint8_t* const b,
-                                        uint32x4_t* const sum) {
+static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
+                                             const uint8_t* const b,
+                                             uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@ -757,7 +763,7 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
 }

 // Horizontal sum of all four uint32_t values in 'sum'.
-static int SumToInt(uint32x4_t sum) {
+static int SumToInt_NEON(uint32x4_t sum) {
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
  return (int)sum3;
@ -767,18 +773,18 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
-    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
-  return SumToInt(sum);
+  return SumToInt_NEON(sum);
 }

 static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
-    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
-  return SumToInt(sum);
+  return SumToInt_NEON(sum);
 }

 static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
@ -791,12 +797,12 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
-  return SumToInt(sum);
+  return SumToInt_NEON(sum);
 }

 static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
-  const uint8x16_t a0 = Load4x4(a);
-  const uint8x16_t b0 = Load4x4(b);
+  const uint8x16_t a0 = Load4x4_NEON(a);
+  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
@ -805,7 +811,7 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
-  return SumToInt(vaddq_u32(sum1, sum2));
+  return SumToInt_NEON(vaddq_u32(sum1, sum2));
 }

 //------------------------------------------------------------------------------
@ -813,8 +819,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
 // Compilation with gcc-4.6.x is problematic for now.
 #if !defined(WORK_AROUND_GCC)

-static int16x8_t Quantize(int16_t* const in,
-                          const VP8Matrix* const mtx, int offset) {
+static int16x8_t Quantize_NEON(int16_t* const in,
+                               const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@ -847,10 +853,10 @@ static const uint8_t kShuffles[4][8] = {
  { 14, 15, 22, 23, 28, 29, 30, 31 }
 };

-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
-  const int16x8_t out0 = Quantize(in, mtx, 0);
-  const int16x8_t out1 = Quantize(in, mtx, 8);
+static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
+                              const VP8Matrix* const mtx) {
+  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
+  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
@ -889,11 +895,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return 0;
 }

-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
+                                const VP8Matrix* const mtx) {
  int nz;
-  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
 }

@ -905,14 +911,14 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitNEON(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
+  VP8ITransform = ITransform_NEON;
+  VP8FTransform = FTransform_NEON;

-  VP8FTransformWHT = FTransformWHT;
+  VP8FTransformWHT = FTransformWHT_NEON;

-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8CollectHistogram = CollectHistogram;
+  VP8TDisto4x4 = Disto4x4_NEON;
+  VP8TDisto16x16 = Disto16x16_NEON;
+  VP8CollectHistogram = CollectHistogram_NEON;

  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
@ -920,8 +926,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8SSE4x4 = SSE4x4_NEON;

 #if !defined(WORK_AROUND_GCC)
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8EncQuantizeBlock = QuantizeBlock_NEON;
+  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
 #endif
 }

--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@ -11,23 +11,23 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_SSE2)
 #include <assert.h>
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>

-#include "./common_sse2.h"
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/common_sse2.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"

 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

 // Does one or two inverse transforms.
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                            int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -193,10 +193,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
  }
 }

-static void FTransformPass1(const __m128i* const in01,
-                            const __m128i* const in23,
-                            __m128i* const out01,
-                            __m128i* const out32) {
+static void FTransformPass1_SSE2(const __m128i* const in01,
+                                 const __m128i* const in23,
+                                 __m128i* const out01,
+                                 __m128i* const out32) {
  const __m128i k937 = _mm_set1_epi32(937);
  const __m128i k1812 = _mm_set1_epi32(1812);

@ -239,8 +239,9 @@ static void FTransformPass1(const __m128i* const in01,
  *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
 }

-static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
-                            int16_t* out) {
+static void FTransformPass2_SSE2(const __m128i* const v01,
+                                 const __m128i* const v32,
+                                 int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
@ -291,7 +292,8 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
  _mm_storeu_si128((__m128i*)&out[8], d2_f3);
 }

-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  // Load src.
  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@ -328,13 +330,14 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  __m128i v01, v32;

  // First pass
-  FTransformPass1(&row01, &row23, &v01, &v32);
+  FTransformPass1_SSE2(&row01, &row23, &v01, &v32);

  // Second pass
-  FTransformPass2(&v01, &v32, out);
+  FTransformPass2_SSE2(&v01, &v32, out);
 }

-static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
+                             int16_t* out) {
  const __m128i zero = _mm_setzero_si128();

  // Load src and convert to 16b.
@ -374,15 +377,15 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  __m128i v01h, v32h;

  // First pass
-  FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
-  FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
+  FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
+  FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);

  // Second pass
-  FTransformPass2(&v01l, &v32l, out + 0);
-  FTransformPass2(&v01h, &v32h, out + 16);
+  FTransformPass2_SSE2(&v01l, &v32l, out + 0);
+  FTransformPass2_SSE2(&v01h, &v32h, out + 16);
 }

-static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
+static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
  const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
  const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
  const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@ -398,14 +401,14 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
  *out = _mm_madd_epi16(D, kMult);
 }

-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
  // Input is 12b signed.
  __m128i row0, row1, row2, row3;
  // Rows are 14b signed.
-  FTransformWHTRow(in + 0 * 64, &row0);
-  FTransformWHTRow(in + 1 * 64, &row1);
-  FTransformWHTRow(in + 2 * 64, &row2);
-  FTransformWHTRow(in + 3 * 64, &row3);
+  FTransformWHTRow_SSE2(in + 0 * 64, &row0);
+  FTransformWHTRow_SSE2(in + 1 * 64, &row1);
+  FTransformWHTRow_SSE2(in + 2 * 64, &row2);
+  FTransformWHTRow_SSE2(in + 3 * 64, &row3);

  {
    // The a* are 15b signed.
@ -431,9 +434,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.

-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+                                  int start_block, int end_block,
+                                  VP8Histogram* const histo) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
@ -442,7 +445,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
    int16_t out[16];
    int k;

-    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
@ -476,7 +479,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 // Intra predictions

 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 8; ++j) {
@ -484,7 +487,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
  }
 }

-static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 16; ++j) {
@ -492,20 +495,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
  }
 }

-static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
  if (size == 4) {
    int j;
    for (j = 0; j < 4; ++j) {
      memset(dst + j * BPS, value, 4);
    }
  } else if (size == 8) {
-    Put8x8uv(value, dst);
+    Put8x8uv_SSE2(value, dst);
  } else {
-    Put16(value, dst);
+    Put16_SSE2(value, dst);
  }
 }

-static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
  int j;
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  for (j = 0; j < 8; ++j) {
@ -513,7 +516,7 @@ static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
  }
 }

-static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
  const __m128i top_values = _mm_load_si128((const __m128i*)top);
  int j;
  for (j = 0; j < 16; ++j) {
@ -521,20 +524,20 @@ static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
  }
 }

-static WEBP_INLINE void VerticalPred(uint8_t* dst,
-                                     const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
+                                          const uint8_t* top, int size) {
  if (top != NULL) {
    if (size == 8) {
-      VE8uv(dst, top);
+      VE8uv_SSE2(dst, top);
    } else {
-      VE16(dst, top);
+      VE16_SSE2(dst, top);
    }
  } else {
-    Fill(dst, 127, size);
+    Fill_SSE2(dst, 127, size);
  }
 }

-static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
  int j;
  for (j = 0; j < 8; ++j) {
    const __m128i values = _mm_set1_epi8(left[j]);
@ -543,7 +546,7 @@ static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
  }
 }

-static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
  int j;
  for (j = 0; j < 16; ++j) {
    const __m128i values = _mm_set1_epi8(left[j]);
@ -552,21 +555,21 @@ static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
  }
 }

-static WEBP_INLINE void HorizontalPred(uint8_t* dst,
-                                       const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
+                                            const uint8_t* left, int size) {
  if (left != NULL) {
    if (size == 8) {
-      HE8uv(dst, left);
+      HE8uv_SSE2(dst, left);
    } else {
-      HE16(dst, left);
+      HE16_SSE2(dst, left);
    }
  } else {
-    Fill(dst, 129, size);
+    Fill_SSE2(dst, 129, size);
  }
 }

-static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
-                           const uint8_t* top, int size) {
+static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
+                                const uint8_t* top, int size) {
  const __m128i zero = _mm_setzero_si128();
  int y;
  if (size == 8) {
@ -593,13 +596,13 @@ static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
  }
 }

-static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
-                                   const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
+                                        const uint8_t* top, int size) {
  if (left != NULL) {
    if (top != NULL) {
-      TM(dst, left, top, size);
+      TM_SSE2(dst, left, top, size);
    } else {
-      HorizontalPred(dst, left, size);
+      HorizontalPred_SSE2(dst, left, size);
    }
  } else {
    // true motion without left samples (hence: with default 129 value)
@ -607,90 +610,90 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
    // Note that if top samples are not available, the default value is
    // then 129, and not 127 as in the VerticalPred case.
    if (top != NULL) {
-      VerticalPred(dst, top, size);
+      VerticalPred_SSE2(dst, top, size);
    } else {
-      Fill(dst, 129, size);
+      Fill_SSE2(dst, 129, size);
    }
  }
 }

-static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
-                              const uint8_t* top) {
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top) {
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
  const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
  const int DC = VP8HorizontalAdd8b(&combined) + 8;
-  Put8x8uv(DC >> 4, dst);
+  Put8x8uv_SSE2(DC >> 4, dst);
 }

-static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  const __m128i sum = _mm_sad_epu8(top_values, zero);
  const int DC = _mm_cvtsi128_si32(sum) + 4;
-  Put8x8uv(DC >> 3, dst);
+  Put8x8uv_SSE2(DC >> 3, dst);
 }

-static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
  // 'left' is contiguous so we can reuse the top summation.
-  DC8uvNoLeft(dst, left);
+  DC8uvNoLeft_SSE2(dst, left);
 }

-static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
-  Put8x8uv(0x80, dst);
+static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
+  Put8x8uv_SSE2(0x80, dst);
 }

-static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
+                                       const uint8_t* top) {
  if (top != NULL) {
    if (left != NULL) {  // top and left present
-      DC8uv(dst, left, top);
+      DC8uv_SSE2(dst, left, top);
    } else {  // top, but no left
-      DC8uvNoLeft(dst, top);
+      DC8uvNoLeft_SSE2(dst, top);
    }
  } else if (left != NULL) {  // left but no top
-    DC8uvNoTop(dst, left);
+    DC8uvNoTop_SSE2(dst, left);
  } else {  // no top, no left, nothing.
-    DC8uvNoTopLeft(dst);
+    DC8uvNoTopLeft_SSE2(dst);
  }
 }

-static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
  const __m128i top_row = _mm_load_si128((const __m128i*)top);
  const __m128i left_row = _mm_load_si128((const __m128i*)left);
  const int DC =
      VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
-  Put16(DC >> 5, dst);
+  Put16_SSE2(DC >> 5, dst);
 }

-static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
  const __m128i top_row = _mm_load_si128((const __m128i*)top);
  const int DC = VP8HorizontalAdd8b(&top_row) + 8;
-  Put16(DC >> 4, dst);
+  Put16_SSE2(DC >> 4, dst);
 }

-static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
  // 'left' is contiguous so we can reuse the top summation.
-  DC16NoLeft(dst, left);
+  DC16NoLeft_SSE2(dst, left);
 }

-static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
-  Put16(0x80, dst);
+static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
+  Put16_SSE2(0x80, dst);
 }

-static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
-                                 const uint8_t* top) {
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
+                                      const uint8_t* top) {
  if (top != NULL) {
    if (left != NULL) {  // top and left present
-      DC16(dst, left, top);
+      DC16_SSE2(dst, left, top);
    } else {  // top, but no left
-      DC16NoLeft(dst, top);
+      DC16NoLeft_SSE2(dst, top);
    }
  } else if (left != NULL) {  // left but no top
-    DC16NoTop(dst, left);
+    DC16NoTop_SSE2(dst, left);
  } else {  // no top, no left, nothing.
-    DC16NoTopLeft(dst);
+    DC16NoTopLeft_SSE2(dst);
  }
 }

@ -709,7 +712,8 @@ static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1

-static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
+static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // vertical
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -725,7 +729,8 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
  }
 }

-static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
+static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // horizontal
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -737,14 +742,15 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }

-static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
-  Fill(dst, dc >> 3, 4);
+  Fill_SSE2(dst, dc >> 3, 4);
 }

-static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {  // Down-Left
+static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Down-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -760,8 +766,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {  // Down-Left
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }

-static WEBP_INLINE void VR4(uint8_t* dst,
-                            const uint8_t* top) {  // Vertical-Right
+static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Vertical-Right
  const __m128i one = _mm_set1_epi8(1);
  const int I = top[-2];
  const int J = top[-3];
@ -786,8 +792,8 @@ static WEBP_INLINE void VR4(uint8_t* dst,
  DST(0, 3) = AVG3(K, J, I);
 }

-static WEBP_INLINE void VL4(uint8_t* dst,
-                            const uint8_t* top) {  // Vertical-Left
+static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Vertical-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@ -812,7 +818,8 @@ static WEBP_INLINE void VL4(uint8_t* dst,
  DST(3, 3) = (extra_out >> 8) & 0xff;
 }

-static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {  // Down-right
+static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Down-right
  const __m128i one = _mm_set1_epi8(1);
  const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
  const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@ -828,7 +835,7 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {  // Down-right
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }

-static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
@ -843,7 +850,7 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -866,7 +873,7 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
  DST(1, 3)             = AVG3(L, K, J);
 }

-static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
  const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@ -888,55 +895,56 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {

 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
-  DC4(I4DC4 + dst, top);
-  TM4(I4TM4 + dst, top);
-  VE4(I4VE4 + dst, top);
-  HE4(I4HE4 + dst, top);
-  RD4(I4RD4 + dst, top);
-  VR4(I4VR4 + dst, top);
-  LD4(I4LD4 + dst, top);
-  VL4(I4VL4 + dst, top);
-  HD4(I4HD4 + dst, top);
-  HU4(I4HU4 + dst, top);
+static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+  DC4_SSE2(I4DC4 + dst, top);
+  TM4_SSE2(I4TM4 + dst, top);
+  VE4_SSE2(I4VE4 + dst, top);
+  HE4_SSE2(I4HE4 + dst, top);
+  RD4_SSE2(I4RD4 + dst, top);
+  VR4_SSE2(I4VR4 + dst, top);
+  LD4_SSE2(I4LD4 + dst, top);
+  VL4_SSE2(I4VL4 + dst, top);
+  HD4_SSE2(I4HD4 + dst, top);
+  HU4_SSE2(I4HU4 + dst, top);
 }

 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
  // U block
-  DC8uvMode(C8DC8 + dst, left, top);
-  VerticalPred(C8VE8 + dst, top, 8);
-  HorizontalPred(C8HE8 + dst, left, 8);
-  TrueMotion(C8TM8 + dst, left, top, 8);
+  DC8uvMode_SSE2(C8DC8 + dst, left, top);
+  VerticalPred_SSE2(C8VE8 + dst, top, 8);
+  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
  // V block
  dst += 8;
  if (top != NULL) top += 8;
  if (left != NULL) left += 16;
-  DC8uvMode(C8DC8 + dst, left, top);
-  VerticalPred(C8VE8 + dst, top, 8);
-  HorizontalPred(C8HE8 + dst, left, 8);
-  TrueMotion(C8TM8 + dst, left, top, 8);
+  DC8uvMode_SSE2(C8DC8 + dst, left, top);
+  VerticalPred_SSE2(C8VE8 + dst, top, 8);
+  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
 }

 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
-  DC16Mode(I16DC16 + dst, left, top);
-  VerticalPred(I16VE16 + dst, top, 16);
-  HorizontalPred(I16HE16 + dst, left, 16);
-  TrueMotion(I16TM16 + dst, left, top, 16);
+static void Intra16Preds_SSE2(uint8_t* dst,
+                              const uint8_t* left, const uint8_t* top) {
+  DC16Mode_SSE2(I16DC16 + dst, left, top);
+  VerticalPred_SSE2(I16VE16 + dst, top, 16);
+  HorizontalPred_SSE2(I16HE16 + dst, left, 16);
+  TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
 }

 //------------------------------------------------------------------------------
 // Metric

-static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
-                                              __m128i* const sum) {
+static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
+                                                   const __m128i b,
+                                                   __m128i* const sum) {
  // take abs(a-b) in 8b
  const __m128i a_b = _mm_subs_epu8(a, b);
  const __m128i b_a = _mm_subs_epu8(b, a);
@ -951,8 +959,8 @@ static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
  *sum = _mm_add_epi32(sum1, sum2);
 }

-static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
-                                int num_pairs) {
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+                                     int num_pairs) {
  __m128i sum = _mm_setzero_si128();
  int32_t tmp[4];
  int i;
@ -963,8 +971,8 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
    __m128i sum1, sum2;
-    SubtractAndAccumulate(a0, b0, &sum1);
-    SubtractAndAccumulate(a1, b1, &sum2);
+    SubtractAndAccumulate_SSE2(a0, b0, &sum1);
+    SubtractAndAccumulate_SSE2(a1, b1, &sum2);
    sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
    a += 2 * BPS;
    b += 2 * BPS;
@ -973,18 +981,18 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }

-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
-  return SSE_16xN(a, b, 8);
+static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN_SSE2(a, b, 8);
 }

-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
-  return SSE_16xN(a, b, 4);
+static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN_SSE2(a, b, 4);
 }

 #define LOAD_8x16b(ptr) \
  _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)

-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  int num_pairs = 4;
  __m128i sum = zero;
@ -1011,7 +1019,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
 }
 #undef LOAD_8x16b

-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
@ -1048,7 +1056,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {

 //------------------------------------------------------------------------------

-static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
  const __m128i mask = _mm_set1_epi16(0x00ff);
  const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
  const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@ -1086,8 +1094,8 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* inA, const uint8_t* inB,
-                      const uint16_t* const w) {
+static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
+                           const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
@ -1187,19 +1195,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
  return sum[0] + sum[1] + sum[2] + sum[3];
 }

-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int diff_sum = TTransform(a, b, w);
+static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
+                         const uint16_t* const w) {
+  const int diff_sum = TTransform_SSE2(a, b, w);
  return abs(diff_sum) >> 5;
 }

-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_SSE2(a + x + y, b + x + y, w);
    }
  }
  return D;
@ -1209,9 +1217,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 //

-static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
-                                       const uint16_t* const sharpen,
-                                       const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+                                            const uint16_t* const sharpen,
+                                            const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
@ -1321,22 +1329,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
 }

-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+                              const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
 }

-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
-                            const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, NULL, mtx);
+static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
+                                 const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
 }

-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
+                                const VP8Matrix* const mtx) {
  int nz;
  const uint16_t* const sharpen = &mtx->sharpen_[0];
-  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
-  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  nz  = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
  return nz;
 }

@ -1346,24 +1354,24 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitSSE2(void);

+// Points the encoder DSP function pointers (histogram collection, intra
+// prediction, quantization, forward/inverse transforms, SSE, distortion and
+// mean) at their _SSE2-suffixed implementations.
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
-  VP8CollectHistogram = CollectHistogram;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransform2 = FTransform2;
-  VP8FTransformWHT = FTransformWHT;
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE4x4 = SSE4x4;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8Mean16x4 = Mean16x4;
+  VP8CollectHistogram = CollectHistogram_SSE2;
+  VP8EncPredLuma16 = Intra16Preds_SSE2;
+  VP8EncPredChroma8 = IntraChromaPreds_SSE2;
+  VP8EncPredLuma4 = Intra4Preds_SSE2;
+  VP8EncQuantizeBlock = QuantizeBlock_SSE2;
+  VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
+  VP8ITransform = ITransform_SSE2;
+  VP8FTransform = FTransform_SSE2;
+  VP8FTransform2 = FTransform2_SSE2;
+  VP8FTransformWHT = FTransformWHT_SSE2;
+  VP8SSE16x16 = SSE16x16_SSE2;
+  VP8SSE16x8 = SSE16x8_SSE2;
+  VP8SSE8x8 = SSE8x8_SSE2;
+  VP8SSE4x4 = SSE4x4_SSE2;
+  VP8TDisto4x4 = Disto4x4_SSE2;
+  VP8TDisto16x16 = Disto16x16_SSE2;
+  VP8Mean16x4 = Mean16x4_SSE2;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/enc_sse41.c
+++ b/src/dsp/enc_sse41.c
@ -11,21 +11,21 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_SSE41)
 #include <smmintrin.h>
 #include <stdlib.h>  // for abs()

-#include "./common_sse2.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/common_sse2.h"
+#include "src/enc/vp8i_enc.h"

 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms.

-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
+                                   int start_block, int end_block,
+                                   VP8Histogram* const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@ -70,8 +70,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* inA, const uint8_t* inB,
-                      const uint16_t* const w) {
+static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
+                            const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;

@ -168,19 +168,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
  return sum[0] + sum[1] + sum[2] + sum[3];
 }

-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int diff_sum = TTransform(a, b, w);
+// Distortion between the blocks at a and b: the absolute value of
+// TTransform_SSE41(a, b, w), shifted right by 5. w is handed through to the
+// transform untouched.
+static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
+  const int diff_sum = TTransform_SSE41(a, b, w);
  return abs(diff_sum) >> 5;
 }

-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
+                            const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_SSE41(a + x + y, b + x + y, w);
    }
  }
  return D;
@ -197,9 +197,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
               2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
               2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)

-static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
-                                       const uint16_t* const sharpen,
-                                       const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+                                             const uint16_t* const sharpen,
+                                             const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i out0, out8;
@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],

 #undef PSHUFB_CST

-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+// Quantizes one block of 16 coefficients by calling DoQuantizeBlock_SSE41()
+// with the sharpening table mtx->sharpen_; the helper's return value is
+// passed back unchanged.
+static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+                               const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
 }

-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
-                            const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, NULL, mtx);
+// Same as QuantizeBlock_SSE41() except that NULL is passed as the sharpening
+// table, i.e. DoQuantizeBlock_SSE41() runs without mtx->sharpen_.
+static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
+                                  const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
 }

-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+// Quantizes two consecutive 16-coefficient blocks stored back to back in
+// in[]/out[]. Bit 0 of the result carries the first block's
+// DoQuantizeBlock_SSE41() return value and bit 1 the second block's.
+static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
+                                 const VP8Matrix* const mtx) {
  int nz;
  const uint16_t* const sharpen = &mtx->sharpen_[0];
-  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
-  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  nz  = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
  return nz;
 }

@ -324,12 +324,12 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],

 extern void VP8EncDspInitSSE41(void);
+// Points the histogram, quantization and distortion function pointers at
+// their _SSE41-suffixed implementations; no other pointer is assigned here.
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
-  VP8CollectHistogram = CollectHistogram;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
+  VP8CollectHistogram = CollectHistogram_SSE41;
+  VP8EncQuantizeBlock = QuantizeBlock_SSE41;
+  VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
+  VP8TDisto4x4 = Disto4x4_SSE41;
+  VP8TDisto16x16 = Disto16x16_SSE41;
 }

 #else  // !WEBP_USE_SSE41
--- a/src/dsp/filters.c
+++ b/src/dsp/filters.c
@ -11,7 +11,7 @@
 //
 // Author: Urvang (urvang@google.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
@ -20,16 +20,17 @@
 // Helpful macro.

+// Asserts that a filter call is well-formed: both buffers are non-NULL,
+// width and height are positive, stride is at least width, and the rows
+// [row, row + num_rows) lie inside the image. Only 'in' and 'out' are macro
+// parameters; width, height, stride, row and num_rows must be in scope at
+// the expansion site. The trailing (void)height keeps 'height' referenced
+// when assert() expands to nothing.
 # define SANITY_CHECK(in, out)                                                 \
-  assert(in != NULL);                                                          \
-  assert(out != NULL);                                                         \
+  assert((in) != NULL);                                                        \
+  assert((out) != NULL);                                                       \
  assert(width > 0);                                                           \
  assert(height > 0);                                                          \
  assert(stride >= width);                                                     \
  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
  (void)height;  // Silence unused warning.

-static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
-                                    uint8_t* dst, int length, int inverse) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
+                                      uint8_t* dst, int length, int inverse) {
  int i;
  if (inverse) {
    for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
@ -41,7 +42,44 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
 //------------------------------------------------------------------------------
 // Horizontal filter.

-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+// Runs the horizontal (left-neighbour) prediction over rows
+// [row, row + num_rows) of 'in', writing to the same positions of 'out'.
+// Both buffers use 'stride' bytes per row. 'inverse' is forwarded to
+// PredictLine_C() and selects whether the prediction is added back
+// (non-zero) or, presumably, subtracted (zero) -- confirm against
+// PredictLine_C(). The first pixel of row 0 is copied as-is; the first pixel
+// of every later row is predicted from the pixel directly above it.
+static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
+                                             int width, int height, int stride,
+                                             int row, int num_rows,
+                                             int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  // When inverting, predictions come from the output written so far;
+  // otherwise they come from the input.
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Leftmost pixel is the same as input for topmost scanline.
+    out[0] = in[0];
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    // Leftmost pixel is predicted from above.
+    PredictLine_C(in, preds - stride, out, 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
                                           int width, int height, int stride,
                                           int row, int num_rows,
                                           int inverse, uint8_t* out) {
@ -53,48 +91,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
  out += start_offset;
  preds = inverse ? out : in;

-  if (row == 0) {
-    // Leftmost pixel is the same as input for topmost scanline.
-    out[0] = in[0];
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
-    row = 1;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
-
-  // Filter line-by-line.
-  while (row < last_row) {
-    // Leftmost pixel is predicted from above.
-    PredictLine(in, preds - stride, out, 1, inverse);
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
-    ++row;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
-}
-
-//------------------------------------------------------------------------------
-// Vertical filter.
-
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         int inverse, uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = inverse ? out : in;
-
  if (row == 0) {
    // Very first top-left pixel is copied.
    out[0] = in[0];
    // Rest of top scan-line is left-predicted.
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
    row = 1;
    in += stride;
    out += stride;
@ -105,26 +106,28 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,

  // Filter line-by-line.
  while (row < last_row) {
-    PredictLine(in, preds, out, width, inverse);
+    PredictLine_C(in, preds, out, width, inverse);
    ++row;
    preds += stride;
    in += stride;
    out += stride;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Gradient filter.

-static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+// Returns a + b - c clamped to the range [0, 255].
+static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
  const int g = a + b - c;
  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }

-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         int inverse, uint8_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@ -136,7 +139,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
  // left prediction for top scan-line
  if (row == 0) {
    out[0] = in[0];
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
    row = 1;
    preds += stride;
    in += stride;
@ -147,11 +150,11 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
  while (row < last_row) {
    int w;
    // leftmost pixel: predict from above.
-    PredictLine(in, preds - stride, out, 1, inverse);
+    PredictLine_C(in, preds - stride, out, 1, inverse);
    for (w = 1; w < width; ++w) {
-      const int pred = GradientPredictor(preds[w - 1],
-                                         preds[w - stride],
-                                         preds[w - stride - 1]);
+      const int pred = GradientPredictor_C(preds[w - 1],
+                                           preds[w - stride],
+                                           preds[w - stride - 1]);
      out[w] = in[w] + (inverse ? pred : -pred);
    }
    ++row;
@ -160,32 +163,34 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
    out += stride;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 #undef SANITY_CHECK

 //------------------------------------------------------------------------------

-static void HorizontalFilter(const uint8_t* data, int width, int height,
+#if !WEBP_NEON_OMIT_C_CODE
+// Whole-image entry points: each one runs the matching Do*Filter_C() helper
+// over rows 0 .. height - 1 with inverse == 0, reading from 'data' and
+// writing to 'filtered_data'.
+static void HorizontalFilter_C(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
+                       filtered_data);
+}
+
+static void VerticalFilter_C(const uint8_t* data, int width, int height,
                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+  DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
 }

-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+static void GradientFilter_C(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
 }
-
-
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
-}
-
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------

-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                                 uint8_t* out, int width) {
  uint8_t pred = (prev == NULL) ? 0 : prev[0];
  int i;
  for (i = 0; i < width; ++i) {
@ -194,26 +199,28 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
  }
 }

-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+#if !WEBP_NEON_OMIT_C_CODE
+// Reconstructs one row of 'width' bytes: out[i] = prev[i] + in[i]. When
+// 'prev' is NULL the row is handed to HorizontalUnfilter_C() instead.
+static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                               uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_C(NULL, in, out, width);
  } else {
    int i;
    for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                               uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_C(NULL, in, out, width);
  } else {
    uint8_t top = prev[0], top_left = top, left = top;
    int i;
    for (i = 0; i < width; ++i) {
      top = prev[i];  // need to read this first, in case prev==out
-      left = in[i] + GradientPredictor(left, top, top_left);
+      left = in[i] + GradientPredictor_C(left, top, top_left);
      top_left = top;
      out[i] = left;
    }
@ -238,14 +245,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
  if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;

  WebPUnfilters[WEBP_FILTER_NONE] = NULL;
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
+#endif
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;

  WebPFilters[WEBP_FILTER_NONE] = NULL;
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
+#endif

  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@ -253,11 +264,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
      VP8FiltersInitSSE2();
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8FiltersInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8FiltersInitMIPSdspR2();
@ -269,5 +275,20 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8FiltersInitNEON();
+  }
+#endif
+
+  assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
+  assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
+  assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
+  assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
+  assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
+  assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
+
  filters_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/filters_mips_dsp_r2.c
+++ b/src/dsp/filters_mips_dsp_r2.c
@ -12,11 +12,11 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut (djordje.pesut@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-#include "../dsp/dsp.h"
+#include "src/dsp/dsp.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
@ -101,8 +101,8 @@
    );                                                                         \
  } while (0)

-static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
-                                    int length) {
+// Thin wrapper that expands DO_PREDICT_LINE(src, dst, length, 0).
+// NOTE(review): the final 0 presumably selects the forward (filtering)
+// direction, since HorizontalUnfilter_MIPSdspR2() passes 1 -- confirm
+// against the macro definition.
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+                                              int length) {
  DO_PREDICT_LINE(src, dst, length, 0);
 }

@ -192,10 +192,11 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
    }                                                                          \
  } while (0)

-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
+                                                     int width, int height,
+                                                     int stride,
+                                                     int row, int num_rows,
+                                                     uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@ -207,7 +208,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
  if (row == 0) {
    // Leftmost pixel is the same as input for topmost scanline.
    out[0] = in[0];
-    PredictLine(in + 1, out + 1, width - 1);
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
    row = 1;
    preds += stride;
    in += stride;
@ -219,9 +220,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE

-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
+// Whole-image entry point: runs DoHorizontalFilter_MIPSdspR2() over rows
+// 0 .. height - 1, reading from 'data' and writing to 'filtered_data'.
+static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
+                                       int width, int height,
+                                       int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                               filtered_data);
 }

 //------------------------------------------------------------------------------
@ -237,9 +240,11 @@ static void HorizontalFilter(const uint8_t* data, int width, int height,
    }                                                                          \
  } while (0)

-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
+                                                   int width, int height,
+                                                   int stride,
+                                                   int row, int num_rows,
+                                                   uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@ -252,7 +257,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
    // Very first top-left pixel is copied.
    out[0] = in[0];
    // Rest of top scan-line is left-predicted.
-    PredictLine(in + 1, out + 1, width - 1);
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
    row = 1;
    in += stride;
    out += stride;
@ -266,15 +271,16 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE

-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
+// Whole-image entry point: runs DoVerticalFilter_MIPSdspR2() over rows
+// 0 .. height - 1, reading from 'data' and writing to 'filtered_data'.
+static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+                                     int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                             filtered_data);
 }

 //------------------------------------------------------------------------------
 // Gradient filter.

-static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
  int temp0;
  __asm__ volatile (
    "addu             %[temp0],   %[a],       %[b]        \n\t"
@ -293,9 +299,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
      int w;                                                                   \
      PREDICT_LINE_ONE_PASS(in, PREDS - stride, out);                          \
      for (w = 1; w < width; ++w) {                                            \
-        const int pred = GradientPredictor(PREDS[w - 1],                       \
-                                           PREDS[w - stride],                  \
-                                           PREDS[w - stride - 1]);             \
+        const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1],             \
+                                                     PREDS[w - stride],        \
+                                                     PREDS[w - stride - 1]);   \
        out[w] = in[w] OPERATION pred;                                         \
      }                                                                        \
      ++row;                                                                   \
@ -304,9 +310,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
    }                                                                          \
  } while (0)

-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows, uint8_t* out) {
+static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
+                                       int width, int height, int stride,
+                                       int row, int num_rows, uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@ -318,7 +324,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
  // left prediction for top scan-line
  if (row == 0) {
    out[0] = in[0];
-    PredictLine(in + 1, out + 1, width - 1);
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
    row = 1;
    preds += stride;
    in += stride;
@ -330,38 +336,39 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE

-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
+// Whole-image entry point: runs DoGradientFilter_MIPSdspR2() over rows
+// 0 .. height - 1, reading from 'data' and writing to 'filtered_data'.
+static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+                                     int stride, uint8_t* filtered_data) {
+  DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                             filtered_data);
 }

 //------------------------------------------------------------------------------

-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+// Reconstructs one row of 'width' bytes. The first byte is in[0] plus
+// prev[0] (or plus 0 when 'prev' is NULL); the remaining width - 1 bytes are
+// produced by DO_PREDICT_LINE with its last argument set to 1.
+// NOTE(review): 1 presumably selects the inverse direction -- confirm
+// against the macro definition.
+static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                         uint8_t* out, int width) {
 out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
 DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
 }

-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+// Reconstructs one row of 'width' bytes from 'in' and the previous row
+// 'prev' through DO_PREDICT_LINE_VERTICAL. When 'prev' is NULL the row is
+// handed to HorizontalUnfilter_MIPSdspR2() instead.
+static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                       uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
  } else {
    DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
  }
 }

-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                       uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
  } else {
    uint8_t top = prev[0], top_left = top, left = top;
    int i;
    for (i = 0; i < width; ++i) {
      top = prev[i];  // need to read this first, in case prev==dst
-      left = in[i] + GradientPredictor(left, top, top_left);
+      left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
      top_left = top;
      out[i] = left;
    }
@ -379,13 +386,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
 extern void VP8FiltersInitMIPSdspR2(void);

+// Installs the _MIPSdspR2-suffixed routines for the horizontal, vertical and
+// gradient entries of both WebPUnfilters[] and WebPFilters[].
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;

-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
 }

 #else  // !WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/filters_msa.c
+++ b/src/dsp/filters_msa.c
@ -11,11 +11,11 @@
 //
 // Author: Prashant Patil (prashant.patil@imgtec.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MSA)

-#include "./msa_macro.h"
+#include "src/dsp/msa_macro.h"

 #include <assert.h>

@ -66,8 +66,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 //------------------------------------------------------------------------------
-// Horrizontal filter
+// Horizontal filter

-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
+static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
+                                 int stride, uint8_t* filtered_data) {
  const uint8_t* preds = data;
  const uint8_t* in = data;
  uint8_t* out = filtered_data;
@ -129,8 +129,8 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
 }


-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
+static void GradientFilter_MSA(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
  const uint8_t* in = data;
  const uint8_t* preds = data;
  uint8_t* out = filtered_data;
@ -157,8 +157,8 @@ static void GradientFilter(const uint8_t* data, int width, int height,
 //------------------------------------------------------------------------------
 // Vertical filter

-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
+static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
  const uint8_t* in = data;
  const uint8_t* preds = data;
  uint8_t* out = filtered_data;
@ -190,9 +190,9 @@ static void VerticalFilter(const uint8_t* data, int width, int height,
 extern void VP8FiltersInitMSA(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
 }

 #else  // !WEBP_USE_MSA
--- a/src/dsp/filters_neon.c
+++ b/src/dsp/filters_neon.c
@ -11,12 +11,12 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_NEON)

 #include <assert.h>
-#include "./neon.h"
+#include "src/dsp/neon.h"

 //------------------------------------------------------------------------------
 // Helpful macros.
@ -134,7 +134,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
 }

 static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+                                int stride, uint8_t* filtered_data) {
  DoVerticalFilter_NEON(data, width, height, stride, 0, height,
                        filtered_data);
 }
@ -196,7 +196,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
 }

 static void GradientFilter_NEON(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+                                int stride, uint8_t* filtered_data) {
  DoGradientFilter_NEON(data, width, height, stride, 0, height,
                        filtered_data);
 }
@ -251,9 +251,11 @@ static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
 // GradientUnfilter_NEON is correct but slower than the C-version,
 // at least on ARM64. For armv7, it's a wash.
 // So best is to disable it for now, but keep the idea around...
-// #define USE_GRADIENT_UNFILTER
+#if !defined(USE_GRADIENT_UNFILTER)
+#define USE_GRADIENT_UNFILTER 0   // ALTERNATE_CODE
+#endif

-#if defined(USE_GRADIENT_UNFILTER)
+#if (USE_GRADIENT_UNFILTER == 1)
 #define GRAD_PROCESS_LANE(L)  do {                                             \
  const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1);  /* rotate predictor in */   \
  const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1));                       \
@ -292,7 +294,7 @@ static void GradientPredictInverse_NEON(const uint8_t* const in,
 #undef GRAD_PROCESS_LANE

 static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
-                                 uint8_t* out, int width) {
+                                  uint8_t* out, int width) {
  if (prev == NULL) {
    HorizontalUnfilter_NEON(NULL, in, out, width);
  } else {
@ -311,7 +313,7 @@ extern void VP8FiltersInitNEON(void);
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
-#if defined(USE_GRADIENT_UNFILTER)
+#if (USE_GRADIENT_UNFILTER == 1)
  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
 #endif

--- a/src/dsp/filters_sse2.c
+++ b/src/dsp/filters_sse2.c
@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

 #if defined(WEBP_USE_SSE2)

@ -24,16 +24,16 @@
 // Helpful macro.

 # define SANITY_CHECK(in, out)                                                 \
-  assert(in != NULL);                                                          \
-  assert(out != NULL);                                                         \
+  assert((in) != NULL);                                                        \
+  assert((out) != NULL);                                                       \
  assert(width > 0);                                                           \
  assert(height > 0);                                                          \
  assert(stride >= width);                                                     \
  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
  (void)height;  // Silence unused warning.

-static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
-                           uint8_t* dst, int length) {
+static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
+                                uint8_t* dst, int length) {
  int i;
  const int max_pos = length & ~31;
  assert(length >= 0);
@ -51,7 +51,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
 }

 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
  int i;
  const int max_pos = length & ~31;
  assert(length >= 0);
@ -71,10 +71,11 @@ static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
 //------------------------------------------------------------------------------
 // Horizontal filter.

-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
+                                                int width, int height,
+                                                int stride,
+                                                int row, int num_rows,
+                                                uint8_t* out) {
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
  SANITY_CHECK(in, out);
@ -84,7 +85,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
  if (row == 0) {
    // Leftmost pixel is the same as input for topmost scanline.
    out[0] = in[0];
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
    row = 1;
    in += stride;
    out += stride;
@ -94,7 +95,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
  while (row < last_row) {
    // Leftmost pixel is predicted from above.
    out[0] = in[0] - in[-stride];
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
    ++row;
    in += stride;
    out += stride;
@ -104,9 +105,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.

-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
+                                              int width, int height, int stride,
+                                              int row, int num_rows,
+                                              uint8_t* out) {
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
  SANITY_CHECK(in, out);
@ -117,7 +119,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
    // Very first top-left pixel is copied.
    out[0] = in[0];
    // Rest of top scan-line is left-predicted.
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
    row = 1;
    in += stride;
    out += stride;
@ -125,7 +127,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,

  // Filter line-by-line.
  while (row < last_row) {
-    PredictLineTop(in, in - stride, out, width);
+    PredictLineTop_SSE2(in, in - stride, out, width);
    ++row;
    in += stride;
    out += stride;
@ -135,14 +137,14 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Gradient filter.

-static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
  const int g = a + b - c;
  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }

-static void GradientPredictDirect(const uint8_t* const row,
-                                  const uint8_t* const top,
-                                  uint8_t* const out, int length) {
+static void GradientPredictDirect_SSE2(const uint8_t* const row,
+                                       const uint8_t* const top,
+                                       uint8_t* const out, int length) {
  const int max_pos = length & ~7;
  int i;
  const __m128i zero = _mm_setzero_si128();
@ -161,14 +163,14 @@ static void GradientPredictDirect(const uint8_t* const row,
    _mm_storel_epi64((__m128i*)(out + i), H);
  }
  for (; i < length; ++i) {
-    out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+    out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
  }
 }

-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         uint8_t* out) {
+static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
+                                              int width, int height, int stride,
+                                              int row, int num_rows,
+                                              uint8_t* out) {
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
  SANITY_CHECK(in, out);
@ -178,7 +180,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
  // left prediction for top scan-line
  if (row == 0) {
    out[0] = in[0];
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
    row = 1;
    in += stride;
    out += stride;
@ -187,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
  // Filter line-by-line.
  while (row < last_row) {
    out[0] = in[0] - in[-stride];
-    GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
+    GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
    ++row;
    in += stride;
    out += stride;
@ -198,26 +200,27 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,

 //------------------------------------------------------------------------------

-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
+static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
+                                  int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
+                          filtered_data);
 }

-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
+                                int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
 }

-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
+                                int stride, uint8_t* filtered_data) {
+  DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
 }

 //------------------------------------------------------------------------------
 // Inverse transforms

-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+                                    uint8_t* out, int width) {
  int i;
  __m128i last;
  out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
@ -238,10 +241,10 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
  for (; i < width; ++i) out[i] = in[i] + out[i - 1];
 }

-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+                                  uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_SSE2(NULL, in, out, width);
  } else {
    int i;
    const int max_pos = width & ~31;
@ -260,9 +263,9 @@ static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
  }
 }

-static void GradientPredictInverse(const uint8_t* const in,
-                                   const uint8_t* const top,
-                                   uint8_t* const row, int length) {
+static void GradientPredictInverse_SSE2(const uint8_t* const in,
+                                        const uint8_t* const top,
+                                        uint8_t* const row, int length) {
  if (length > 0) {
    int i;
    const int max_pos = length & ~7;
@ -293,18 +296,18 @@ static void GradientPredictInverse(const uint8_t* const in,
      _mm_storel_epi64((__m128i*)&row[i], out);
    }
    for (; i < length; ++i) {
-      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+      row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
    }
  }
 }

-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+                                  uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_SSE2(NULL, in, out, width);
  } else {
    out[0] = in[0] + prev[0];  // predict from above
-    GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1);
+    GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
  }
 }

@ -314,13 +317,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
 extern void VP8FiltersInitSSE2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;

-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@ -13,14 +13,15 @@
 //          Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)

-#include "./dsp.h"
+#include "src/dsp/dsp.h"

+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
-#include "../dec/vp8li_dec.h"
-#include "../utils/endian_inl_utils.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"

 #define MAX_DIFF_COST (1e30f)

@ -80,8 +81,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
 }

-// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
-#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
+// inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
 # define LOCAL_INLINE __attribute__ ((noinline))
 #else
 # define LOCAL_INLINE WEBP_INLINE
@ -107,69 +109,69 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors

-static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
  (void)top;
  (void)left;
  return ARGB_BLACK;
 }
-static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
  (void)top;
  return left;
 }
-static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
  (void)left;
  return top[0];
 }
-static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
  (void)left;
  return top[1];
 }
-static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
  (void)left;
  return top[-1];
 }
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average3(left, top[0], top[1]);
  return pred;
 }
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(left, top[-1]);
  return pred;
 }
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(left, top[0]);
  return pred;
 }
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(top[-1], top[0]);
  (void)left;
  return pred;
 }
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(top[0], top[1]);
  (void)left;
  return pred;
 }
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
  return pred;
 }
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Select(top[0], left, top[-1]);
  return pred;
 }
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
  return pred;
 }
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
  return pred;
 }

-GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0)
-static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
-                          int num_pixels, uint32_t* out) {
+GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
+static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
+                            int num_pixels, uint32_t* out) {
  int i;
  uint32_t left = out[-1];
  for (i = 0; i < num_pixels; ++i) {
@ -177,29 +179,29 @@ static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
  }
  (void)upper;
 }
-GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2)
-GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3)
-GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4)
-GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5)
-GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6)
-GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7)
-GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8)
-GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9)
-GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10)
-GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11)
-GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12)
-GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13)
+GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
+GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
+GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
+GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
+GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
+GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
+GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
+GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
+GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
+GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
+GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
+GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)

 //------------------------------------------------------------------------------

 // Inverse prediction.
-static void PredictorInverseTransform(const VP8LTransform* const transform,
-                                      int y_start, int y_end,
-                                      const uint32_t* in, uint32_t* out) {
+static void PredictorInverseTransform_C(const VP8LTransform* const transform,
+                                        int y_start, int y_end,
+                                        const uint32_t* in, uint32_t* out) {
  const int width = transform->xsize_;
  if (y_start == 0) {  // First Row follows the L (mode=1) mode.
-    PredictorAdd0(in, NULL, 1, out);
-    PredictorAdd1(in + 1, NULL, width - 1, out + 1);
+    PredictorAdd0_C(in, NULL, 1, out);
+    PredictorAdd1_C(in + 1, NULL, width - 1, out + 1);
    in += width;
    out += width;
    ++y_start;
@ -217,7 +219,7 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
      const uint32_t* pred_mode_src = pred_mode_base;
      int x = 1;
      // First pixel follows the T (mode=2) mode.
-      PredictorAdd2(in, out - width, 1, out);
+      PredictorAdd2_C(in, out - width, 1, out);
      // .. the rest:
      while (x < width) {
        const VP8LPredictorAddSubFunc pred_func =
@ -272,8 +274,8 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
    const uint32_t argb = src[i];
    const uint32_t green = argb >> 8;
    const uint32_t red = argb >> 16;
-    int new_red = red;
-    int new_blue = argb;
+    int new_red = red & 0xff;
+    int new_blue = argb & 0xff;
    new_red += ColorTransformDelta(m->green_to_red_, green);
    new_red &= 0xff;
    new_blue += ColorTransformDelta(m->green_to_blue_, green);
@ -284,9 +286,9 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
 }

 // Color space inverse transform.
-static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
-                                       int y_start, int y_end,
-                                       const uint32_t* src, uint32_t* dst) {
+static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform,
+                                         int y_start, int y_end,
+                                         const uint32_t* src, uint32_t* dst) {
  const int width = transform->xsize_;
  const int tile_width = 1 << transform->bits_;
  const int mask = tile_width - 1;
@ -362,10 +364,10 @@ STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform,               \
  }                                                                            \
 }

-COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
-                    VP8GetARGBIndex, VP8GetARGBValue)
-COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
-                    8b, VP8GetAlphaIndex, VP8GetAlphaValue)
+COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static,
+                    uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue)
+COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, ,
+                    uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue)

 #undef COLOR_INDEX_INVERSE

@ -380,7 +382,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
      VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
      break;
    case PREDICTOR_TRANSFORM:
-      PredictorInverseTransform(transform, row_start, row_end, in, out);
+      PredictorInverseTransform_C(transform, row_start, row_end, in, out);
      if (row_end != transform->ysize_) {
        // The last predicted row in this iteration will be the top-pred row
        // for the first row in next iteration.
@ -389,7 +391,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
      }
      break;
    case CROSS_COLOR_TRANSFORM:
-      ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
+      ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out);
      break;
    case COLOR_INDEXING_TRANSFORM:
      if (in == out && transform->bits_ > 0) {
@ -403,9 +405,9 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
            VP8LSubSampleSize(transform->xsize_, transform->bits_);
        uint32_t* const src = out + out_stride - in_stride;
        memmove(src, out, in_stride * sizeof(*src));
-        ColorIndexInverseTransform(transform, row_start, row_end, src, out);
+        ColorIndexInverseTransform_C(transform, row_start, row_end, src, out);
      } else {
-        ColorIndexInverseTransform(transform, row_start, row_end, in, out);
+        ColorIndexInverseTransform_C(transform, row_start, row_end, in, out);
      }
      break;
  }
@ -452,7 +454,7 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
    const uint32_t argb = *src++;
    const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
    const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
    *dst++ = ba;
    *dst++ = rg;
 #else
@ -469,7 +471,7 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
    const uint32_t argb = *src++;
    const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
    const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
    *dst++ = gb;
    *dst++ = rg;
 #else
@ -496,22 +498,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
    const uint32_t* const src_end = src + num_pixels;
    while (src < src_end) {
      const uint32_t argb = *src++;
-
-#if !defined(WORDS_BIGENDIAN)
-#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
      WebPUint32ToMem(dst, BSwap32(argb));
-#else  // WEBP_REFERENCE_IMPLEMENTATION
-      dst[0] = (argb >> 24) & 0xff;
-      dst[1] = (argb >> 16) & 0xff;
-      dst[2] = (argb >>  8) & 0xff;
-      dst[3] = (argb >>  0) & 0xff;
-#endif
-#else  // WORDS_BIGENDIAN
-      dst[0] = (argb >>  0) & 0xff;
-      dst[1] = (argb >>  8) & 0xff;
-      dst[2] = (argb >> 16) & 0xff;
-      dst[3] = (argb >> 24) & 0xff;
-#endif
      dst += sizeof(argb);
    }
  } else {
@ -593,23 +580,23 @@ extern void VP8LDspInitMSA(void);
 static volatile VP8CPUInfo lossless_last_cpuinfo_used =
    (VP8CPUInfo)&lossless_last_cpuinfo_used;

-#define COPY_PREDICTOR_ARRAY(IN, OUT) do {              \
-  (OUT)[0] = IN##0;                                     \
-  (OUT)[1] = IN##1;                                     \
-  (OUT)[2] = IN##2;                                     \
-  (OUT)[3] = IN##3;                                     \
-  (OUT)[4] = IN##4;                                     \
-  (OUT)[5] = IN##5;                                     \
-  (OUT)[6] = IN##6;                                     \
-  (OUT)[7] = IN##7;                                     \
-  (OUT)[8] = IN##8;                                     \
-  (OUT)[9] = IN##9;                                     \
-  (OUT)[10] = IN##10;                                   \
-  (OUT)[11] = IN##11;                                   \
-  (OUT)[12] = IN##12;                                   \
-  (OUT)[13] = IN##13;                                   \
-  (OUT)[14] = IN##0; /* <- padding security sentinels*/ \
-  (OUT)[15] = IN##0;                                    \
+#define COPY_PREDICTOR_ARRAY(IN, OUT) do {                \
+  (OUT)[0] = IN##0_C;                                     \
+  (OUT)[1] = IN##1_C;                                     \
+  (OUT)[2] = IN##2_C;                                     \
+  (OUT)[3] = IN##3_C;                                     \
+  (OUT)[4] = IN##4_C;                                     \
+  (OUT)[5] = IN##5_C;                                     \
+  (OUT)[6] = IN##6_C;                                     \
+  (OUT)[7] = IN##7_C;                                     \
+  (OUT)[8] = IN##8_C;                                     \
+  (OUT)[9] = IN##9_C;                                     \
+  (OUT)[10] = IN##10_C;                                   \
+  (OUT)[11] = IN##11_C;                                   \
+  (OUT)[12] = IN##12_C;                                   \
+  (OUT)[13] = IN##13_C;                                   \
+  (OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \
+  (OUT)[15] = IN##0_C;                                    \
 } while (0);

 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
@ -620,18 +607,21 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
  COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
  COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)

+#if !WEBP_NEON_OMIT_C_CODE
  VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;

  VP8LTransformColorInverse = VP8LTransformColorInverse_C;

-  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
  VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
+  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
+  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
+#endif
+
  VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
  VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
-  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;

-  VP8LMapColor32b = MapARGB;
-  VP8LMapColor8b = MapAlpha;
+  VP8LMapColor32b = MapARGB_C;
+  VP8LMapColor8b = MapAlpha_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@ -640,11 +630,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
      VP8LDspInitSSE2();
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8LDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8LDspInitMIPSdspR2();
@ -656,6 +641,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8LDspInitNEON();
+  }
+#endif
+
+  assert(VP8LAddGreenToBlueAndRed != NULL);
+  assert(VP8LTransformColorInverse != NULL);
+  assert(VP8LConvertBGRAToRGBA != NULL);
+  assert(VP8LConvertBGRAToRGB != NULL);
+  assert(VP8LConvertBGRAToBGR != NULL);
+  assert(VP8LConvertBGRAToRGBA4444 != NULL);
+  assert(VP8LConvertBGRAToRGB565 != NULL);
+  assert(VP8LMapColor32b != NULL);
+  assert(VP8LMapColor8b != NULL);
+
  lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }
 #undef COPY_PREDICTOR_ARRAY
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -15,18 +15,18 @@
 #ifndef WEBP_DSP_LOSSLESS_H_
 #define WEBP_DSP_LOSSLESS_H_

-#include "../webp/types.h"
-#include "../webp/decode.h"
+#include "src/webp/types.h"
+#include "src/webp/decode.h"

-#include "../enc/histogram_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/histogram_enc.h"
+#include "src/utils/utils.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 #ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "../enc/delta_palettization_enc.h"
+#include "src/enc/delta_palettization_enc.h"
 #endif  // WEBP_EXPERIMENTAL_FEATURES

 //------------------------------------------------------------------------------
@ -124,7 +124,7 @@ void VP8LDspInit(void);
 typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
 extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
-                                       uint32_t* const dst, int num_pixels);
+                                       uint32_t* dst, int num_pixels);
 extern VP8LTransformColorFunc VP8LTransformColor;
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
    const uint32_t* argb, int stride,
--- a/src/dsp/lossless_common.h
+++ b/src/dsp/lossless_common.h
@ -16,9 +16,9 @@
 #ifndef WEBP_DSP_LOSSLESS_COMMON_H_
 #define WEBP_DSP_LOSSLESS_COMMON_H_

-#include "../webp/types.h"
+#include "src/webp/types.h"

-#include "../utils/utils.h"
+#include "src/utils/utils.h"

 #ifdef __cplusplus
 extern "C" {
--- a/Show More
+++ b/Show More