README.wasm: add instructions for enabling mulhi

Change-Id: I1e9dd737f06ad76f73824352291a6e129ca5ded1
WebPMemToUint32: remove ptr cast to int
2026-04-10 06:40:02 +02:00 · 2017-11-02 11:20:09 -07:00 · 2017-10-31 18:24:54 -07:00 · 2017-10-30 20:40:48 -07:00 · 2017-10-28 11:49:18 -07:00 · 2017-10-28 11:49:18 -07:00
209 changed files with 5616 additions and 5472 deletions
--- a/Android.mk
+++ b/Android.mk
@@ -55,6 +55,9 @@ dsp_dec_srcs := \
    src/dsp/alpha_processing_neon.$(NEON) \
    src/dsp/alpha_processing_sse2.c \
    src/dsp/alpha_processing_sse41.c \
+    src/dsp/argb.c \
+    src/dsp/argb_mips_dsp_r2.c \
+    src/dsp/argb_sse2.c \
    src/dsp/cpu.c \
    src/dsp/dec.c \
    src/dsp/dec_clip_tables.c \
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,17 +4,17 @@ project(libwebp C)

 # Options for coder / decoder executables.
 option(WEBP_ENABLE_SIMD "Enable any SIMD optimization." ON)
+option(WEBP_ENABLE_WASM "Enable WebAssembly optimizations." OFF)
 option(WEBP_BUILD_CWEBP "Build the cwebp command line tool." OFF)
 option(WEBP_BUILD_DWEBP "Build the dwebp command line tool." OFF)
 option(WEBP_BUILD_GIF2WEBP "Build the gif2webp conversion tool." OFF)
 option(WEBP_BUILD_IMG2WEBP "Build the img2webp animation tool." OFF)
 option(WEBP_BUILD_WEBPINFO "Build the webpinfo command line tool." OFF)
 option(WEBP_BUILD_WEBP_JS "Emscripten build of webp.js." OFF)
-option(WEBP_ENABLE_NEAR_LOSSLESS "Enable near-lossless encoding" ON)
 option(WEBP_EXPERIMENTAL_FEATURES "Build with experimental features." OFF)
 option(WEBP_ENABLE_SWAP_16BIT_CSP "Enable byte swap for 16 bit colorspaces." OFF)

-if(WEBP_BUILD_WEBP_JS)
+if(WEBP_BUILD_WEBP_JS OR WEBP_ENABLE_WASM)
  set(WEBP_ENABLE_SIMD OFF)
 endif()

@@ -27,14 +27,19 @@ if(NOT CMAKE_BUILD_TYPE)
  )
 endif()

-# Include dependencies.
-include(cmake/deps.cmake)
-include(GNUInstallDirs)
+include(cmake/config.h.cmake)
+
+# Extract the version of the library.
+file(READ ${CMAKE_CURRENT_SOURCE_DIR}/configure.ac SOURCE_FILE)
+string(REGEX MATCH "[0-9.]+" WEBP_VERSION ${SOURCE_FILE})

 ################################################################################
 # Options.
 if(WEBP_ENABLE_SWAP_16BIT_CSP)
-  add_definitions(-DWEBP_SWAP_16BIT_CSP=1)
+  add_definitions(-DWEBP_SWAP_16BIT_CSP)
+endif()
+if(WEBP_ENABLE_WASM)
+  add_definitions(-DWEBP_USE_WASM)
 endif()

 ################################################################################
@@ -49,10 +54,7 @@ if(ANDROID)
  set(WEBP_DEP_INCLUDE_DIRS ${WEBP_DEP_INCLUDE_DIRS}
    ${ANDROID_NDK}/sources/android/cpufeatures
  )
-  add_definitions(-DHAVE_CPU_FEATURES_H=1)
-  set(HAVE_CPU_FEATURES_H 1)
-else()
-  set(HAVE_CPU_FEATURES_H 0)
+  add_definitions(-DHAVE_CPU_FEATURES_H)
 endif()

 ################################################################################
@@ -104,13 +106,8 @@ endforeach()

 ### Define the mandatory libraries.
 # Build the webpdecoder library.
-if(MSVC)
-  # avoid security warnings for e.g., fopen() used in the examples.
-  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
-else()
-  add_definitions(-Wall)
-endif()
-include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${WEBP_DEP_INCLUDE_DIRS})
+add_definitions(-Wall)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/ ${WEBP_DEP_INCLUDE_DIRS})
 add_library(webpdecode OBJECT ${WEBP_DEC_SRCS})
 add_library(webpdspdecode OBJECT ${WEBP_DSP_COMMON_SRCS} ${WEBP_DSP_DEC_SRCS})
 add_library(webputilsdecode OBJECT ${WEBP_UTILS_COMMON_SRCS}
@@ -121,32 +118,13 @@ target_link_libraries(webpdecoder ${WEBP_DEP_LIBRARIES})

 # Build the webp library.
 add_library(webpencode OBJECT ${WEBP_ENC_SRCS})
-target_include_directories(
-  webpencode PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
-                     ${CMAKE_CURRENT_SOURCE_DIR}/src)
 add_library(webpdsp OBJECT ${WEBP_DSP_COMMON_SRCS} ${WEBP_DSP_DEC_SRCS}
-                           ${WEBP_DSP_ENC_SRCS})
-target_include_directories(webpdsp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
-                                           ${CMAKE_CURRENT_SOURCE_DIR})
+  ${WEBP_DSP_ENC_SRCS})
 add_library(webputils OBJECT ${WEBP_UTILS_COMMON_SRCS} ${WEBP_UTILS_DEC_SRCS}
-                             ${WEBP_UTILS_ENC_SRCS})
-target_include_directories(webputils PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
-                                             ${CMAKE_CURRENT_SOURCE_DIR})
+  ${WEBP_UTILS_ENC_SRCS})
 add_library(webp $<TARGET_OBJECTS:webpdecode> $<TARGET_OBJECTS:webpdsp>
-                 $<TARGET_OBJECTS:webpencode> $<TARGET_OBJECTS:webputils>)
-if(XCODE)
-  libwebp_add_stub_file(webp)
-endif()
+  $<TARGET_OBJECTS:webpencode> $<TARGET_OBJECTS:webputils>)
 target_link_libraries(webp ${WEBP_DEP_LIBRARIES})
-target_include_directories(
-  webp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}
-  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
-         $<INSTALL_INTERFACE:include>)
-set_target_properties(
-  webp
-  PROPERTIES PUBLIC_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h;\
-${CMAKE_CURRENT_SOURCE_DIR}/src/webp/encode.h;\
-${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h")

 # Make sure the OBJECT libraries are built with position independent code
 # (it is not ON by default).
@@ -156,17 +134,6 @@ set_target_properties(webpdecode webpdspdecode webputilsdecode
 # Build the webp demux library.
 add_library(webpdemux ${WEBP_DEMUX_SRCS})
 target_link_libraries(webpdemux webp)
-target_include_directories(
-  webpdemux PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}
-  PUBLIC $<INSTALL_INTERFACE:include>)
-set_target_properties(
-  webpdemux
-  PROPERTIES
-    PUBLIC_HEADER
-    "${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h;\
-${CMAKE_CURRENT_SOURCE_DIR}/src/webp/demux.h;\
-${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux_types.h;\
-${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h")

 # Set the version numbers.
 function(parse_version FILE NAME VAR)
@@ -178,13 +145,13 @@ function(parse_version FILE NAME VAR)
  set(${VAR} "${VERSION}" PARENT_SCOPE)
 endfunction()
 parse_version(Makefile.am webp WEBP_WEBP_SOVERSION)
-set_target_properties(webp PROPERTIES VERSION ${PACKAGE_VERSION}
+set_target_properties(webp PROPERTIES VERSION ${WEBP_VERSION}
  SOVERSION ${WEBP_WEBP_SOVERSION})
 parse_version(Makefile.am webpdecoder WEBP_DECODER_SOVERSION)
-set_target_properties(webpdecoder PROPERTIES VERSION ${PACKAGE_VERSION}
+set_target_properties(webpdecoder PROPERTIES VERSION ${WEBP_VERSION}
  SOVERSION ${WEBP_DECODER_SOVERSION})
 parse_version(demux/Makefile.am webpdemux WEBP_DEMUX_SOVERSION)
-set_target_properties(webpdemux PROPERTIES VERSION ${PACKAGE_VERSION}
+set_target_properties(webpdemux PROPERTIES VERSION ${WEBP_VERSION}
  SOVERSION ${WEBP_DEMUX_SOVERSION})

 # Define the libraries to install.
@@ -200,9 +167,11 @@ math(EXPR WEBP_SIMD_FILES_TO_INCLUDE_RANGE
 foreach(I_FILE RANGE ${WEBP_SIMD_FILES_TO_INCLUDE_RANGE})
  list(GET WEBP_SIMD_FILES_TO_INCLUDE ${I_FILE} FILE)
  list(GET WEBP_SIMD_FLAGS_TO_INCLUDE ${I_FILE} SIMD_COMPILE_FLAG)
-  set_source_files_properties(${FILE} PROPERTIES
-    COMPILE_FLAGS ${SIMD_COMPILE_FLAG}
-  )
+  if(NOT "${SIMD_COMPILE_FLAG}" STREQUAL "NOTFOUND")  # quoted: an empty flag must not break if()
+    set_source_files_properties(${FILE} PROPERTIES
+      COMPILE_FLAGS "${SIMD_COMPILE_FLAG}"
+    )
+  endif()
 endforeach()

 # Build the executables if asked for.
@@ -231,10 +200,6 @@ if(WEBP_BUILD_CWEBP OR WEBP_BUILD_DWEBP OR
    "imageenc_[^ ]*")
  add_library(imageenc ${IMAGEENC_SRCS})
  target_link_libraries(imageenc webp)
-
-  set_property(TARGET exampleutil imageioutil imagedec imageenc
-    PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if(WEBP_BUILD_DWEBP)
@@ -245,8 +210,6 @@ if(WEBP_BUILD_DWEBP)
  add_executable(dwebp ${DWEBP_SRCS})
  target_link_libraries(dwebp exampleutil imagedec imageenc webpdecoder)
  install(TARGETS dwebp RUNTIME DESTINATION bin)
-  set_property(TARGET dwebp PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if(WEBP_BUILD_CWEBP)
@@ -257,12 +220,6 @@ if(WEBP_BUILD_CWEBP)
  add_executable(cwebp ${CWEBP_SRCS})
  target_link_libraries(cwebp exampleutil imagedec webp)
  install(TARGETS cwebp RUNTIME DESTINATION bin)
-  set_property(TARGET cwebp PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
-endif()
-
-if(WEBP_BUILD_GIF2WEBP AND NOT GIF_FOUND)
-  unset(WEBP_BUILD_GIF2WEBP CACHE)
 endif()

 if(WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP)
@@ -271,13 +228,8 @@ if(WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP)
  add_library(webpmux ${WEBP_MUX_SRCS})
  target_link_libraries(webpmux webp)
  parse_version(mux/Makefile.am webpmux WEBP_MUX_SOVERSION)
-  set_target_properties(webpmux PROPERTIES VERSION ${PACKAGE_VERSION}
+  set_target_properties(webpmux PROPERTIES VERSION ${WEBP_VERSION}
    SOVERSION ${WEBP_MUX_SOVERSION})
-  set_target_properties(
-    webpmux
-    PROPERTIES PUBLIC_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux.h;\
-${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux_types.h;\
-${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h;")
  list(APPEND INSTALLED_LIBRARIES webpmux)
 endif()

@@ -290,8 +242,6 @@ if(WEBP_BUILD_GIF2WEBP)
  target_link_libraries(gif2webp exampleutil imageioutil webp webpmux
    ${WEBP_DEP_GIF_LIBRARIES})
  install(TARGETS gif2webp RUNTIME DESTINATION bin)
-  set_property(TARGET gif2webp PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if(WEBP_BUILD_IMG2WEBP)
@@ -302,8 +252,6 @@ if(WEBP_BUILD_IMG2WEBP)
  add_executable(img2webp ${IMG2WEBP_SRCS})
  target_link_libraries(img2webp exampleutil imagedec imageioutil webp webpmux)
  install(TARGETS img2webp RUNTIME DESTINATION bin)
-  set_property(TARGET img2webp PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if (WEBP_BUILD_WEBPINFO)
@@ -314,8 +262,6 @@ if (WEBP_BUILD_WEBPINFO)
  add_executable(webpinfo ${WEBPINFO_SRCS})
  target_link_libraries(webpinfo exampleutil imageioutil)
  install(TARGETS webpinfo RUNTIME DESTINATION bin)
-  set_property(TARGET webpinfo PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

 if(WEBP_BUILD_WEBP_JS)
@@ -323,7 +269,6 @@ if(WEBP_BUILD_WEBP_JS)
  add_executable(webp_js
                 ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
  target_link_libraries(webp_js webpdecoder SDL)
-  set(WEBP_HAVE_SDL 1)
  set_target_properties(webp_js PROPERTIES LINK_FLAGS
      "-s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0")
  set_target_properties(webp_js PROPERTIES OUTPUT_NAME webp)
@@ -341,33 +286,23 @@ if(WEBP_BUILD_WEBP_JS)
  target_compile_definitions(webpdecoder PUBLIC EMSCRIPTEN)
 endif()

-# Generate the config.h file.
-configure_file(${CMAKE_CURRENT_LIST_DIR}/cmake/config.h.in
-  ${CMAKE_CURRENT_BINARY_DIR}/src/webp/config.h)
-add_definitions(-DHAVE_CONFIG_H)
-# The webp folder is included as we reference config.h as
-# ../webp/config.h or webp/config.h
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-
 # Install the different headers and libraries.
-install(
-  TARGETS ${INSTALLED_LIBRARIES}
-  EXPORT WebPTargets
-  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/webp
-  INCLUDES
-  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
-set(ConfigPackageLocation ${CMAKE_INSTALL_DATADIR}/WebP/cmake/)
-install(EXPORT WebPTargets NAMESPACE WebP::
-        DESTINATION ${ConfigPackageLocation})
+install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/demux.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/encode.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux_types.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h
+        DESTINATION include/webp)
+install(TARGETS ${INSTALLED_LIBRARIES}
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)

 # Create the CMake version file.
 include(CMakePackageConfigHelpers)
 write_basic_package_version_file(
  "${CMAKE_CURRENT_BINARY_DIR}/WebPConfigVersion.cmake"
-  VERSION ${PACKAGE_VERSION}
+  VERSION ${WEBP_VERSION}
  COMPATIBILITY AnyNewerVersion
 )

@@ -378,7 +313,7 @@ configure_package_config_file(
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/WebPConfig.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/WebPConfig.cmake
  INSTALL_DESTINATION ${ConfigPackageLocation}
-  PATH_VARS CMAKE_INSTALL_INCLUDEDIR)
+)

 # Install the generated CMake files.
 install(
--- a/285
+++ b/285
@@ -1,294 +1,9 @@
-f66955de WEBP_REDUCE_CSP: restrict colorspace support
-a289d8e7 update ChangeLog (tag: v0.6.1-rc2)
-c10a493c vwebp: disable double buffering on windows & mac
-0d4466c2 webp_to_sdl.c: fix file mode
-1b27bf8b WEBP_REDUCE_SIZE: disable all rescaler code
-126be109 webpinfo: add -version option
-9add62b5 bump version to 0.6.1
-d3e26144 update NEWS
-2edda639 README: add webpinfo section
-9ca568ef Merge "right-size some tables"
-31f1995c Merge "SSE2 implementation of HasAlphaXXX"
-a80c46bd SSE2 implementation of HasAlphaXXX
-083507f2 right-size some tables
-2e5785b2 anim_utils.c: remove warning when !defined(WEBP_HAVE_GIF)
-b299c47e add WEBP_REDUCE_SIZE
-f593d71a enc: disable pic->stats/extra_info w/WEBP_DISABLE_STATS
-541179a9 Merge "predictor_enc: fix build w/--disable-near-lossless"
-5755a7ec predictor_enc: fix build w/--disable-near-lossless
-eab5bab7 add WEBP_DISABLE_STATS
-8052c585 remove some petty TODOs from vwebp.
-c245343d move LOAD8x4 and STORE8x2 closer to their use location
-b9e734fd dec,cosmetics: normalize function naming style
-c188d546 dec: harmonize function suffixes
-28c5ac81 dec_sse41: harmonize function suffixes
-e65b72a3 Merge "introduce WebPHasAlpha8b and WebPHasAlpha32b"
-b94cee98 dec_sse2: remove HE8uv_SSE2
-44a0ee3f introduce WebPHasAlpha8b and WebPHasAlpha32b
-aebf59ac Merge "WebPPictureAllocARGB: align argb allocation"
-c184665e WebPPictureAllocARGB: align argb allocation
-3daf7509 WebPParseHeaders: remove obsolete animation TODO
-80285d97 cmake: avoid security warnings under msvc
-650eac55 cmake: don't set -Wall with MSVC
-c462cd00 Remove useless code.
-01a98217 Merge "remove WebPWorkerImpl declaration from the header"
-3c49fc47 Merge "thread_utils: fix potentially bad call to Execute"
-fde2782e thread_utils: fix potentially bad call to Execute
-2a270c1d remove WebPWorkerImpl declaration from the header
-f1f437cc remove mention of 'lossy-only parameters' from the doc
-3879074d Merge "WebPMemToUint32: remove ptr cast to int"
-04b029d2 WebPMemToUint32: remove ptr cast to int
-b7971d0e dsp: avoid defining _C functions w/NEON builds
-6ba98764 webpdec: correct alloc size check w/use_argb
-5cfb3b0f normalize include guards
-f433205e Merge changes Ia17c7dfc,I75423abb,Ia2f716b4,I161caa14,I4210081a, ...
-8d033b14 {dec,enc}_neon: harmonize function suffixes x2
-0295e981 upsampling_neon: harmonize function suffixes
-d572c4e5 yuv_neon: harmonize function suffixes
-ab9c2500 rescaler_neon: harmonize function suffixes
-93e0ce27 lossless_neon: harmonize function suffixes
-22fbc50e lossless_enc_neon: harmonize function suffixes
-447875b4 filters_neon,cosmetics: fix indent
-e51bdd43 remove unused VP8TokenToStats() function
-785da7ea enc_neon: harmonize function suffixes
-bc1a251f dec_neon: harmonize function suffixes
-61e535f1 dsp/lossless: workaround gcc-4.8 bug on arm
-68b2eab7 cwebp: fix alpha reporting w/lossless & metadata
-30042faa WebPDemuxGetI: add doc details around WebPFormatFeature
-0a17f471 Merge "WIP: list includes as descendants of the project dir"
-a4399721 WIP: list includes as descendants of the project dir
-08275708 Merge "Make sure we reach the full range for alpha blending."
-d361a6a7 yuv_sse2: harmonize function suffixes
-6921aa6f upsampling_sse2: harmonize function suffixes
-08c67d3e ssim_sse2: harmonize function suffixes
-582a1b57 rescaler_sse2: harmonize function suffixes
-2c1b18ba lossless_sse2: harmonize function suffixes
-0ac46e81 lossless_enc_sse2: harmonize function suffixes
-bc634d57 enc_sse2: harmonize function suffixes
-bcb7347c dec_sse2: harmonize function suffixes
-e14ad93c Make sure we reach the full range for alpha blending.
-7038ca8d demux,StoreFrame: restore hdr size check to min req
-fb3daad6 cpu: fix ssse3 check
-be590e06 Merge "Fix CMake redefinition for HAVE_CPU_FEATURES_H"
-35f736e1 Fix CMake redefinition for HAVE_CPU_FEATURES_H
-a5216efc Fix integer overflow warning.
-a9c8916b decode.h,WebPIDecGetRGB: clarify output ptr validity
-3c74c645 gif2webp: handle 1-frame case properly + fix anim_diff
-c7f295d3 Merge "gif2webp: introduce -loop_compatibility option"
-b4e04677 gif2webp: introduce -loop_compatibility option
-f78da3de add LOCAL_CLANG_PREREQ and avoid WORK_AROUND_GCC w/3.8+
-01c426f1 define WEBP_USE_INTRINSICS w/gcc-4.9+
-8635973d use sdl-config (if available) to determine the link flags
-e9459382 use CPPFLAGS before CFLAGS
-4a9d788e Merge "Android.mk,mips: fix clang build with r15"
-4fbdc9fb Android.mk,mips: fix clang build with r15
-a80fcc4a ifdef code not used by Chrome/Android.
-3993af12 Fix signed integer overflows.
-f66f94ef anim_dump: small tool to dump frames from animated WebP
-6eba857b Merge "rationalize the Makefile.am"
-c5e34fba function definition cleanup
-3822762a rationalize the Makefile.am
-501ef6e4 configure style fix: animdiff -> anim_diff
-f8bdc268 Merge "protect against NULL dump_folder[] value in ReadAnimatedImage()"
-23bfc652 protect against NULL dump_folder[] value in ReadAnimatedImage()
-8dc3d71b cosmetics,ReadAnimatedWebP: correct function comment
-5bd40066 Merge changes I66a64a0a,I4d2e520f
-7945575c cosmetics,webpinfo: remove an else after a return
-8729fa11 cosmetics,cwebp: remove an else after a return
-f324b7f9 cosmetics: normalize fn proto & decl param names
-869eb369 CMake cleanups.
-289e62a3 Remove declaration of unimplemented VP8ApplyNearLosslessPredict
-20a94186 pnmdec,PAM: validate depth before calculating bytes_per_px
-34130afe anim_encode: fix integer overflow
-42c79aa6 Merge "Encoder: harmonize function suffixes"
-b09307dc Encoder: harmonize function suffixes
-bed0456d Merge "SSIM: harmonize the function suffix"
-54f6a3cf lossless_sse2.c: fix some missed suffix changes
-088f1dcc SSIM: harmonize the function suffix
-86fc4dd9 webpdec: use ImgIoUtilCheckSizeArgumentsOverflow
-08ea9ecd imageio: add ability restrict max image size
-6f9daa4a jpegdec,ReadError: fix leaks on error
-a0f72a4f VP8LTransformColorFunc: drop an non-respected 'const' from the signature.
-8c934902 Merge "Lossess dec: harmonize the function suffixes"
-622242aa Lossess dec: harmonize the function suffixes
-1411f027 Lossless Enc: harmonize the function suffixes
-24ad2e3c add const to two variables
-46efe062 Merge "Allow the lossless cruncher to work for alpha."
-8c3f9a47 Speed-up LZ77.
-1aef4c71 Allow the lossless cruncher to work for alpha.
-b8821dbd Improve the box LZ77 speed.
-7beed280 add missing ()s to macro parameters
-6473d20b Merge "fix Android standalone toolchain build"
-dcefed95 Merge "build.gradle: fix arm64 build"
-0c83a8bc Merge "yuv: harmonize suffix naming"
-c6d1db4b fix Android standalone toolchain build
-663a6d9d unify the ALTERNATE_CODE flag usage
-73ea9f27 yuv: harmonize suffix naming
-c71b68ac build.gradle: fix arm64 build
-c4568b47 Rescaler: harmonize the suffix naming
-6cb13b05 Merge "alpha_processing: harmonize the naming suffixes to be _C()"
-83a3e69a Merge "simplify WEBP_EXTERN macro"
-7295fde2 Merge "filters: harmonize the suffixes naming to _SSE2(), _C(), etc."
-8e42ba4c simplify WEBP_EXTERN macro
-331ab34b cost*.c: harmonize the suffix namings
-b161f670 filters: harmonize the suffixes naming to _SSE2(), _C(), etc.
-dec5e4d3 alpha_processing: harmonize the naming suffixes to be _C()
-6878d427 fix memory leak in SDL_Init()
-461ae555 Merge "configure: fix warnings in sdl check"
-62486a22 configure: test for -Wundef
-92982609 dsp.h: fix -Wundef w/__mips_dsp_rev
-0265cede configure: fix warnings in sdl check
-88c73d8a backward_references_enc.h: fix WINDOW_SIZE_BITS check
-4ea49f6b rescaler_sse2.c: fix WEBP_RESCALER_FIX -> _RFIX typo
-1b526638 Clean-up some CMake
-87f57a4b Merge "cmake: fix gif lib detection when cross compiling"
-b34a9db1 cosmetics,dec_sse2: remove some redundant comments
-471c5755 cmake: fix gif lib detection when cross compiling
-c793417a cmake: disable gif2webp if gif lib isn't found
-dcbc1c88 cmake: split gif detection from IMG deps
-66ad84f0 Merge "muxread: remove unreachable code"
-50ec3ab7 muxread: remove unreachable code
-7d67a164 Lossy encoding: smoothen transparent areas to improve compression
-e50650c7 Merge "fix signature for DISABLE_TOKEN_BUFFER compilation"
-671d2567 fix signature for DISABLE_TOKEN_BUFFER compilation
-d6755580 cpu.cmake: use unique flag to test simd disable flags
-28914528 Merge "Remove the argb* files."
-8acb4942 Remove the argb* files.
-3b62347b README: correct cmake invocation note
-7ca0df13 Have the SSE2 version of PackARGB use common code.
-7b250459 Merge "Re-use the transformed image when trying several LZ77 in lossless."
-e132072f Re-use the transformed image when trying several LZ77 in lossless.
-5d7a50ef Get code to compile in C++.
-7b012987 configure: test for -Wparentheses-equality
-f0569adb Fix man pages for multi-threading.
-f1d5a397 multithread cruncher: only copy stats when picture->stats != NULL
-f8c2ac15 Multi-thread the lossless cruncher.
-a88c6522 Merge "Integrate a new LZ77 looking for matches in the neighborhood of a pixel only."
-8f6df1d0 Unroll Predictors 10, 11 and 12.
-355c3d1b Integrate a new LZ77 looking for matches in the neighborhood of a pixel only.
-a1779a01 Refactor LZ77 handling in preparation for a new method.
-67de68b5 Android.mk/build.gradle: fix mips build with clang from r14b
-f209a548 Use the plane code and not the distance when computing statistics.
-b903b80c Split cost-based backward references in its own file.
-498cad34 Cosmetic changes in backward reference.
-e4eb4587 lossless, VP8LTransformColor_C: make sure no overflow happens with colors.
-af6deaff webpinfo: handle alpha flag mismatch
-7caef29b Fix typo that creeped in.
-39e19f92 Merge "near lossless: fix unsigned int overflow warnings."
-9bbc0891 near lossless: fix unsigned int overflow warnings.
-e1118d62 Merge "cosmetics,FindClosestDiscretized: use uint in mask creation"
-186bc9b7 Merge "webpinfo: tolerate ALPH+VP8L"
-b5887297 cosmetics,FindClosestDiscretized: use uint in mask creation
-f1784aee near_lossless,FindClosestDiscretized: use unsigned ops
-0d20abb3 webpinfo: tolerate ALPH+VP8L
-972104b3 webpmux: tolerate false positive Alpha flag
-dd7e83cc tiffdec,ReadTIFF: ensure data_size is < tsize_t max
-d988eb7b tiffdec,MyRead: quiet -Wshorten-64-to-32 warning
-dabda707 webpinfo: add support to parse Alpha bitstream
-4c117643 webpinfo: correct background color output, BGRA->ARGB
-defc98d7 Doc: clarify the role of quality in WebPConfig.
-d78ff780 Merge "Fix code to compile with C++."
-c8f14093 Fix code to compile with C++.
-497dc6a7 pnmdec: sanitize invalid header output
-d78e5867 Merge "configure: test for -Wconstant-conversion"
-481e91eb Merge "pnmdec,PAM: set bytes_per_px based on depth when missing"
-93b12753 configure: test for -Wconstant-conversion
-645f0c53 pnmdec,PAM: set bytes_per_px based on depth when missing
-e9154605 Merge "vwebp: activate GLUT double-buffering"
-818d795b vwebp: activate GLUT double-buffering
-d63e6f4b Add a man page for webpinfo
-4d708435 Merge "NEON: implement ConvertRGB24ToY/BGR24/ARGB/RGBA32ToUV/ARGBToUV"
-faf42213 NEON: implement ConvertRGB24ToY/BGR24/ARGB/RGBA32ToUV/ARGBToUV
-b4d576fa Install man pages with CMake.
-cbc1b921 webpinfo: add features to parse bitstream header
-e644c556 Fix bad bit writer initialization.
-b62cdad2 Merge "Implement a cruncher for lossless at method 6."
-da3e4dfb use the exact constant for the gamma transfer function
-a9c701e0 Merge "tiffdec: fix EXTRASAMPLES check"
-adab8ce0 Implement a cruncher for lossless at method 6.
-1b92b237 Merge "Fix VP8ApplyNearLossless to respect const and stride."
-1923ff02 tiffdec: fix EXTRASAMPLES check
-97cce5ba tiffdec: only request EXTRASAMPLES w/> 3 samples/px
-0dcd85b6 Fix VP8ApplyNearLossless to respect const and stride.
-f7682189 yuv: rationalize the C/SSE2 function naming
-52245424 NEON implementation of some Sharp-YUV420 functions
-690efd82 Avoid several backward reference copies.
-4bb1f607 src/dec/vp8_dec.h, cosmetics: fix comments
-285748be cmake: build/install webpinfo
-78fd199c backward_references_enc.c: clear -Wshadow warnings
-ae836410 WebPLog2FloorC: clear -Wshadow warning
-d0b7404e Merge "WASM support"
-134e314f WASM support
-c08adb6f Merge "VP8LEnc: remove use of BitsLog2Ceiling()"
-28c37ebd VP8LEnc: remove use of BitsLog2Ceiling()
-2cb58ab2 webpinfo: output format as a human readable string
-bb175a93 Merge "rename some symbols clashing with MSVC headers"
-39eda658 Remove a duplicated pixel hash implementation.
-36b8274d rename some symbols clashing with MSVC headers
-274daf54 Add webpinfo tool.
-ec5036e4 add explicit reference to /usr/local/{lib,inc}
-18f0dfac Merge "fix TIFF encoder regarding rgbA/RGBA"
-4e2b0b50 Merge "webpdec.h: fix a doc typo"
-e2eeabff Merge "Install binaries, libraries and headers in CMake."
-836607e6 webpdec.h: fix a doc typo
-9273e441 fix TIFF encoder regarding rgbA/RGBA
-17e3c11f Add limited PAM decoding support
-5f624871 Install binaries, libraries and headers in CMake.
-976adac1 Merge "lossless incremental decoding: fix missing eos_ test"
-f8fad4fa lossless incremental decoding: fix missing eos_ test
-27415d41 Merge "vwebp_sdl: fix the makefile.unix"
-49566182 Merge "ImgIoUtilWriteFile(): use ImgIoUtilSetBinaryMode"
-6f75a51b Analyze the transform entropy on the whole image.
-a5e4e3af Use palette only if we can in entropy analysis.
-75a9c3c4 Improve compression by better entropy analysis.
-39cf6f4f vwebp_sdl: fix the makefile.unix
-699b0416 ImgIoUtilWriteFile(): use ImgIoUtilSetBinaryMode
-7d985bd1 Fix small entropy analysis bug.
-6e7caf06 Optimize the color cache size.
-833c9219 More efficient stochastic histogram merge.
-5183326b Refactor the greedy histogram merge.
-99f6f462 Merge "histogram_enc.c,MyRand: s/ul/u/ for unsigned constants"
-80a22186 ssim.c: remove dead include
-a128dfff histogram_enc.c,MyRand: s/ul/u/ for unsigned constants
-693bf74e move the SSIM calculation code in ssim.c / ssim_sse2.c
-10d791ca Merge "Fix the random generator in HistogramCombineStochastic."
-fa63a966 Fix the random generator in HistogramCombineStochastic.
-16be192f VP8LSetBitPos: remove the eos_ setting
-027151ca don't erase the surface before blitting.
-4105d565 disable WEBP_USE_XXX optimisations when EMSCRIPTEN is defined
-9ee32a75 Merge "WebP-JS: emscripten-based Javascript decoder"
-ca9f7b7d WebP-JS: emscripten-based Javascript decoder
-868aa690 Perform greedy histogram merge in a unified way.
-5b393f2d Merge "fix path typo for vwebp_sdl in Makefile.vc"
-e0012bea CMake: only use libwebpdecoder for building dwebp
-84c2a7b0 fix path typo for vwebp_sdl in Makefile.vc
-1b0e4abf Merge "Add a flag to disable SIMD optimizations."
-32263250 Add a flag to disable SIMD optimizations.
-b494fdec optimize the ARGB->ARGB Import to use memcpy
-f1536039 Merge "ReadWebP: decode directly into a pre-allocated buffer"
-e69ed291 ReadWebP: decode directly into a pre-allocated buffer
-57d8de8a Merge "vwebp_sdl: simple viewer based on SDL"
-5cfd4ebc LZ77 interval speedups. Faster, smaller, simpler.
-1e7ad88b PNM header decoder: add some basic numerical validation
-17c7890c Merge "Add a decoder only library for WebP in CMake."
-be733786 Merge "Add clang build fix for MSA"
-03cda0e4 Add a decoder only library for WebP in CMake.
-aa893914 Add clang build fix for MSA
-31a92e97 Merge "imageio: add limited PNM support for reading"
-dcf9d82a imageio: add limited PNM support for reading
-6524fcd6 vwebp_sdl: simple viewer based on SDL
-6cf24a24 get_disto: fix reference file read
-43d472aa Merge tag 'v0.6.0'
-50d1a848 update ChangeLog (tag: v0.6.0, origin/0.6.0, 0.6.0)
 20a7fea0 extras/Makefile.am: fix libwebpextras.la reference
 415f3ffe update ChangeLog (tag: v0.6.0-rc3)
 3c6d1224 update NEWS
 ee4a4141 update AUTHORS
 32ed856f Fix "all|no frames are keyframes" settings.
-1c3190b6 Merge "Fix "all|no frames are keyframes" settings."
 f4dc56fd disable GradientUnfilter_NEON
-4f3e3bbd disable GradientUnfilter_NEON
-2dc0bdca Fix "all|no frames are keyframes" settings.
 0d8e0588 img2webp: treat -loop as a no-op w/single images
 b0450139 ReadImage(): restore size reporting
 0ad3b4ef update ChangeLog (tag: v0.6.0-rc2)
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -29,7 +29,7 @@ PLATFORM_LDFLAGS = /SAFESEH
 NOLOGO     = /nologo
 CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
 CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
-CFLAGS     = /I. /Isrc $(NOLOGO) /W3 /EHsc /c
+CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
 LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
@@ -155,7 +155,6 @@ CFGSET = TRUE
 !MESSAGE - all                            - build (de)mux-based targets for CFG
 !MESSAGE - gif2webp                       - requires libgif & >= VS2013
 !MESSAGE - anim_diff                      - requires libgif & >= VS2013
-!MESSAGE - anim_dump
 !MESSAGE
 !MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
 !MESSAGE   'legacy' will produce a Windows 2000 compatible library.
@@ -234,6 +233,9 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\yuv_sse2.obj \

 DSP_ENC_OBJS = \
+    $(DIROBJ)\dsp\argb.obj \
+    $(DIROBJ)\dsp\argb_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\argb_sse2.obj \
    $(DIROBJ)\dsp\cost.obj \
    $(DIROBJ)\dsp\cost_mips32.obj \
    $(DIROBJ)\dsp\cost_mips_dsp_r2.obj \
@@ -356,15 +358,10 @@ all: ex $(EXTRA_EXAMPLES)
 # C99 support which is only available from VS2013 onward.
 gif2webp: $(DIRBIN)\gif2webp.exe
 anim_diff: $(DIRBIN)\anim_diff.exe
-anim_dump: $(DIRBIN)\anim_dump.exe

 $(DIRBIN)\anim_diff.exe: $(DIROBJ)\examples\anim_diff.obj $(EX_ANIM_UTIL_OBJS)
 $(DIRBIN)\anim_diff.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS)
 $(DIRBIN)\anim_diff.exe: $(EX_GIF_DEC_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
-$(DIRBIN)\anim_dump.exe: $(DIROBJ)\examples\anim_dump.obj $(EX_ANIM_UTIL_OBJS)
-$(DIRBIN)\anim_dump.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS)
-$(DIRBIN)\anim_dump.exe: $(EX_GIF_DEC_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
-$(DIRBIN)\anim_dump.exe: $(IMAGEIO_ENC_OBJS)
 $(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj $(IMAGEIO_DEC_OBJS)
 $(DIRBIN)\cwebp.exe: $(IMAGEIO_UTIL_OBJS)
 $(DIRBIN)\dwebp.exe: $(DIROBJ)\examples\dwebp.obj $(IMAGEIO_DEC_OBJS)
@@ -447,7 +444,7 @@ $(OUTPUT_DIRS):
 $(DIROBJ)\$(DLLINC):
 	@echo #ifndef WEBP_DLL_H_ > $@
 	@echo #define WEBP_DLL_H_ >> $@
-	@echo #define WEBP_EXTERN __declspec(dllexport) >> $@
+	@echo #define WEBP_EXTERN(type) __declspec(dllexport) type >> $@
 	@echo #endif  /* WEBP_DLL_H_ */ >> $@

 .SUFFIXES: .c .obj .res .exe
@@ -459,9 +456,6 @@ $(DIROBJ)\dsp\enc_avx2.obj: src\dsp\enc_avx2.c
 $(DIROBJ)\examples\anim_diff.obj: examples\anim_diff.c
 	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
 	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
-$(DIROBJ)\examples\anim_dump.obj: examples\anim_dump.c
-	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
-	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
 $(DIROBJ)\examples\anim_util.obj: examples\anim_util.c
 	$(CC) $(CFLAGS) /DWEBP_HAVE_GIF /Fd$(LIBWEBP_PDBNAME) \
 	  /Fo$(DIROBJ)\examples\ examples\$(@B).c
--- a/13
+++ b/13
@@ -1,16 +1,3 @@
- 11/24/2017: version 0.6.1
-  This is a binary compatible release.
-  * lossless performance and compression improvements + a new 'cruncher' mode
-    (-m 6 -q 100)
-  * ARM performance improvements with clang (15-20% w/ndk r15c, issue #339)
-  * webp-js: emscripten/webassembly based javascript decoder
-  * miscellaneous bug & build fixes (issue #329, #332, #343, #353, #360, #361,
-    #363)
-  Tool updates / additions:
-    added webpinfo - prints file format information (issue #330)
-    gif2webp - loop behavior modified to match Chrome M63+ (crbug.com/649264);
-               '-loop_compatibility' can be used for the old behavior
-
 - 1/26/2017: version 0.6.0
  * lossless performance and compression improvements
  * miscellaneous performance improvements (SSE2, NEON, MSA)
--- a/25
+++ b/25
@@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.6.1
+               \____/____/\_____/_____/____/v0.6.0

 Description:
 ============
@@ -113,8 +113,8 @@ make install

 CMake:
 ------
-With CMake, you can compile libwebp, cwebp, dwebp, gif2web, img2webp, webpinfo
-and the JS bindings.
+With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp and the
+JS bindings.

 Prerequisites:
 A compiler (e.g., gcc with autotools) and CMake.
@@ -367,23 +367,6 @@ Use following options to convert into alternate image formats:
  -quiet ....... quiet mode, don't print anything
  -noasm ....... disable all assembly optimizations

-WebP file analysis tool:
-========================
-
-'webpinfo' can be used to print out the chunk level structure and bitstream
-header information of WebP files. It can also check if the files are of valid
-WebP format.
-
-Usage: webpinfo [options] in_files
-Note: there could be multiple input files;
-      options must come before input files.
-Options:
-  -version ........... Print version number and exit.
-  -quiet ............. Do not show chunk parsing information.
-  -diag .............. Show parsing error diagnosis.
-  -summary ........... Show chunk stats summary.
-  -bitstream_info .... Parse bitstream header.
-
 Visualization tool:
 ===================

@@ -494,8 +477,6 @@ Options:
  -metadata <string> ..... comma separated list of metadata to
                           copy from the input to the output if present
                           Valid values: all, none, icc, xmp (default)
-  -loop_compatibility .... use compatibility mode for Chrome
-                           version prior to M62 (inclusive)
  -mt .................... use multi-threading if available

  -version ............... print version number and exit
--- a/README.mux
+++ b/README.mux
@@ -1,7 +1,7 @@
          __   __  ____  ____  ____  __ __  _     __ __
         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
         \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.4.1
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.4.0


 Description:
--- a/README.wasm
+++ b/README.wasm
@@ -0,0 +1,91 @@
+Description:
+============
+
+This file describes how to compile libwebp to native targets with clang and
+CMake, using portable intrinsics / WebAssembly (wasm).
+
+Prerequisites:
+==============
+
+- cmake 2.8+
+
+- clang 3.9+ for portable intrinsics support; as wasm progresses, a
+  tip-of-tree build may be necessary.
+
+Building:
+=========
+
+ - configure the project with CMake using:
+
+ $ mkdir -p build && \
+   cd build && \
+   cmake -DWEBP_BUILD_DWEBP=1 -DCMAKE_C_COMPILER=clang -DWEBP_ENABLE_WASM=1 ../
+
+ - compile dwebp using 'make'.
+
+ - Note this currently generates native executables only and is incompatible
+   with -DWEBP_BUILD_WEBP_JS.
+
+Build options:
+==============
+
+- platform-specific multiply high (mulhi) implementation, disabled by default.
+  arm: -DCMAKE_C_FLAGS='-DENABLE_NEON_BUILTIN_MULHI_INT16X8 ...'
+  x86: -DCMAKE_C_FLAGS='-DENABLE_X86_BUILTIN_MULHI_INT16X8 ...'
+
+Cross compilation:
+==================
+
+ - arm toolchains can be obtained from:
+   http://www.linaro.org/downloads/
+
+ - the android ndk can be obtained from:
+   https://developer.android.com/ndk/downloads/index.html
+
+armv7:
+------
+
+Android:
+ $ ./android-ndk-r15b/build/tools/make_standalone_toolchain.py \
+   --arch arm --api 24 --stl gnustl --install-dir /opt/android-arm-24
+ $ mkdir -p build && cd build
+ $ cmake ../libwebp \
+   -DWEBP_BUILD_DWEBP=1 \
+   -DCMAKE_C_COMPILER=/opt/android-arm-24/bin/clang \
+   -DCMAKE_PREFIX_PATH=/opt/android-arm-24/sysroot/usr/lib \
+   -DCMAKE_C_FLAGS=-fPIE \
+   -DCMAKE_EXE_LINKER_FLAGS=-Wl,-pie \
+   -DCMAKE_BUILD_TYPE=Release \
+   -DWEBP_ENABLE_WASM=1
+
+Linux:
+ $ gcc_arm=/opt/gcc-arm; target=arm-linux-gnueabihf
+ $ mkdir -p build && cd build
+ $ cmake ../libwebp -DWEBP_BUILD_DWEBP=1 -DWEBP_ENABLE_WASM=1 \
+   -DCMAKE_C_COMPILER=clang \
+   -DCMAKE_C_FLAGS="--target=$target --gcc-toolchain=$gcc_arm --sysroot=$gcc_arm/$target/libc -march=armv7-a -mfpu=neon" \
+   -DCMAKE_PREFIX_PATH=$gcc_arm/$target/libc/usr
+
+aarch64 / arm64:
+----------------
+
+Android:
+ $ ./android-ndk-r15b/build/tools/make_standalone_toolchain.py \
+   --arch arm64 --api 24 --stl gnustl --install-dir /opt/android-arm64-24
+ $ mkdir -p build && cd build
+ $ cmake ../libwebp \
+   -DWEBP_BUILD_DWEBP=1 \
+   -DCMAKE_C_COMPILER=/opt/android-arm64-24/bin/clang \
+   -DCMAKE_PREFIX_PATH=/opt/android-arm64-24/sysroot/usr/lib \
+   -DCMAKE_C_FLAGS=-fPIE \
+   -DCMAKE_EXE_LINKER_FLAGS=-Wl,-pie \
+   -DCMAKE_BUILD_TYPE=Release \
+   -DWEBP_ENABLE_WASM=1
+
+Linux:
+ $ gcc_arm=/opt/gcc-aarch64; target=aarch64-linux-gnu
+ $ mkdir -p build && cd build
+ $ cmake ../libwebp -DWEBP_BUILD_DWEBP=1 -DWEBP_ENABLE_WASM=1 \
+   -DCMAKE_C_COMPILER=clang \
+   -DCMAKE_C_FLAGS="--target=$target --gcc-toolchain=$gcc_arm --sysroot=$gcc_arm/$target/libc" \
+   -DCMAKE_PREFIX_PATH=$gcc_arm/$target/libc/usr
--- a/README.webp_js
+++ b/README.webp_js
@@ -31,6 +31,11 @@ using Emscripten and CMake.
 - that's it! Upon completion, you should have the webp.js and
   webp.js.mem files generated.

+ - Note: this generates both webp_js and webp_wasm without any SIMD enabled,
+   due to bugs in this toolchain related to the SSE2 code.
+   -DWEBP_ENABLE_WASM is currently meant to generate native (x86, arm)
+   executables (dwebp, cwebp) and is incompatible with -DWEBP_BUILD_WEBP_JS.
+
 The callable JavaScript function is WebPToSDL(), which decodes a raw WebP
 bitstream into a canvas. See webp_js/index.html for a simple usage sample.

--- a/build.gradle
+++ b/build.gradle
@@ -82,14 +82,12 @@ model {
        }
      }
      // Check for NEON usage.
-      if (getTargetPlatform() == "arm") {
+      if (getTargetPlatform() == "arm" || getTargetPlatform() == "arm64") {
        NEON = "c.neon"
        cCompiler.define "HAVE_CPU_FEATURES_H"
      } else {
        NEON = "c"
      }
-
-      cCompiler.args "-I" + file(".").absolutePath
    }
    // Link to pthread for shared libraries.
    withType(SharedLibraryBinarySpec) {
@@ -122,6 +120,9 @@ model {
            include "alpha_processing_neon.$NEON"
            include "alpha_processing_sse2.c"
            include "alpha_processing_sse41.c"
+            include "argb.c"
+            include "argb_mips_dsp_r2.c"
+            include "argb_sse2.c"
            include "cpu.c"
            include "dec.c"
            include "dec_clip_tables.c"
--- a/cmake/WebPConfig.cmake.in
+++ b/cmake/WebPConfig.cmake.in
@@ -1,19 +1,6 @@
-set(WebP_VERSION @PROJECT_VERSION@)
-set(WEBP_VERSION ${WebP_VERSION})
-
@PACKAGE_INIT@

-if(@WEBP_USE_THREAD@)
-  include(CMakeFindDependencyMacro)
-  find_dependency(Threads REQUIRED)
-endif()
-
-include("${CMAKE_CURRENT_LIST_DIR}/WebPTargets.cmake")
-
-set_and_check(WebP_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@")
-set(WebP_INCLUDE_DIRS ${WebP_INCLUDE_DIR})
-set(WEBP_INCLUDE_DIRS ${WebP_INCLUDE_DIR})
+set(WebP_INCLUDE_DIRS "webp")
+set(WEBP_INCLUDE_DIRS ${WebP_INCLUDE_DIRS})
 set(WebP_LIBRARIES "@INSTALLED_LIBRARIES@")
 set(WEBP_LIBRARIES "${WebP_LIBRARIES}")
-
-check_required_components(WebP)
--- a/cmake/config.h.cmake
+++ b/cmake/config.h.cmake
@@ -70,43 +70,18 @@ foreach(I_LIB PNG JPEG TIFF)
  set(WEBP_HAVE_${I_LIB} ${${I_LIB}_FOUND})
  if(${I_LIB}_FOUND)
    list(APPEND WEBP_DEP_IMG_LIBRARIES ${${I_LIB}_LIBRARIES})
-    list(APPEND WEBP_DEP_IMG_INCLUDE_DIRS
-         ${${I_LIB}_INCLUDE_DIR} ${${I_LIB}_INCLUDE_DIRS})
+    list(APPEND WEBP_DEP_IMG_INCLUDE_DIRS ${${I_LIB}_INCLUDE_DIRS})
  endif()
 endforeach()
-if(WEBP_DEP_IMG_INCLUDE_DIRS)
-  list(REMOVE_DUPLICATES WEBP_DEP_IMG_INCLUDE_DIRS)
-endif()

 # GIF detection, gifdec isn't part of the imageio lib.
-include(CMakePushCheckState)
 set(WEBP_DEP_GIF_LIBRARIES)
 set(WEBP_DEP_GIF_INCLUDE_DIRS)
 find_package(GIF)
 set(WEBP_HAVE_GIF ${GIF_FOUND})
 if(GIF_FOUND)
-  # GIF find_package only locates the header and library, it doesn't fail
-  # compile tests when detecting the version, but falls back to 3 (as of at
-  # least cmake 3.7.2). Make sure the library links to avoid incorrect
-  # detection when cross compiling.
-  cmake_push_check_state()
-  set(CMAKE_REQUIRED_LIBRARIES ${GIF_LIBRARIES})
-  set(CMAKE_REQUIRED_INCLUDES ${GIF_INCLUDE_DIR})
-  check_c_source_compiles("
-      #include <gif_lib.h>
-      int main(void) {
-        (void)DGifOpenFileHandle;
-        return 0;
-      }
-      " GIF_COMPILES
-  )
-  cmake_pop_check_state()
-  if(GIF_COMPILES)
-    list(APPEND WEBP_DEP_GIF_LIBRARIES ${GIF_LIBRARIES})
-    list(APPEND WEBP_DEP_GIF_INCLUDE_DIRS ${GIF_INCLUDE_DIR})
-  else()
-    unset(GIF_FOUND)
-  endif()
+  list(APPEND WEBP_DEP_GIF_LIBRARIES ${GIF_LIBRARIES})
+  list(APPEND WEBP_DEP_GIF_INCLUDE_DIRS ${GIF_INCLUDE_DIR})
 endif()

 ## Check for specific headers.
@@ -164,3 +139,13 @@ strip_bracket(PACKAGE_URL)
 set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
 set(PACKAGE_TARNAME ${PACKAGE_NAME})
 set(VERSION ${PACKAGE_VERSION})
+
+## Generate the config.h header.
+configure_file(${CMAKE_CURRENT_LIST_DIR}/config.h.in
+  ${CMAKE_CURRENT_BINARY_DIR}/include/webp/config.h)
+add_definitions(-DHAVE_CONFIG_H)
+# The webp folder is included as we reference config.h as
+# ../webp/config.h or webp/config.h
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/include
+  ${CMAKE_CURRENT_BINARY_DIR}/include/webp
+)
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -13,9 +13,6 @@
 /* Set to 1 if __builtin_bswap64 is available */
 #cmakedefine HAVE_BUILTIN_BSWAP64 1

-/* Define to 1 if you have the <cpu-features.h> header file. */
-#cmakedefine HAVE_CPU_FEATURES_H 1
-
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #cmakedefine HAVE_DLFCN_H 1

@@ -118,19 +115,9 @@
 /* Set to 1 if JPEG library is installed */
 #cmakedefine WEBP_HAVE_JPEG 1

-/* Set to 1 if NEON is supported */
-#cmakedefine WEBP_HAVE_NEON
-
-/* Set to 1 if runtime detection of NEON is enabled */
-/* TODO: handle properly in CMake */
-#cmakedefine WEBP_HAVE_NEON_RTCD
-
 /* Set to 1 if PNG library is installed */
 #cmakedefine WEBP_HAVE_PNG 1

-/* Set to 1 if SDL library is installed */
-#cmakedefine WEBP_HAVE_SDL 1
-
 /* Set to 1 if SSE2 is supported */
 #cmakedefine WEBP_HAVE_SSE2 1

@@ -140,9 +127,6 @@
 /* Set to 1 if TIFF library is installed */
 #cmakedefine WEBP_HAVE_TIFF 1

-/* Enable near lossless encoding */
-#cmakedefine WEBP_NEAR_LOSSLESS 1
-
 /* Undefine this to disable thread support. */
 #cmakedefine WEBP_USE_THREAD 1

--- a/cmake/cpu.cmake
+++ b/cmake/cpu.cmake
@@ -1,5 +1,4 @@
 ## Check for SIMD extensions.
-include(CMakePushCheckState)

 function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
  if(NOT ENABLE_SIMD)
@@ -8,8 +7,6 @@ function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
    return()
  endif()
  unset(WEBP_HAVE_FLAG_${WEBP_SIMD_FLAG} CACHE)
-  cmake_push_check_state()
-  set(CMAKE_REQUIRED_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR})
  check_c_source_compiles("
      #include \"${CMAKE_CURRENT_LIST_DIR}/../src/dsp/dsp.h\"
      int main(void) {
@@ -20,7 +17,6 @@ function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
      }
    " WEBP_HAVE_FLAG_${WEBP_SIMD_FLAG}
  )
-  cmake_pop_check_state()
  if(WEBP_HAVE_FLAG_${WEBP_SIMD_FLAG})
    set(WEBP_HAVE_${WEBP_SIMD_FLAG} 1 PARENT_SCOPE)
  else()
@@ -64,7 +60,6 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
  # First try with no extra flag added as the compiler might have default flags
  # (especially on Android).
  unset(WEBP_HAVE_${WEBP_SIMD_FLAG} CACHE)
-  cmake_push_check_state()
  set(CMAKE_REQUIRED_FLAGS)
  webp_check_compiler_flag(${WEBP_SIMD_FLAG} ${WEBP_ENABLE_SIMD})
  if(NOT WEBP_HAVE_${WEBP_SIMD_FLAG})
@@ -90,8 +85,11 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
    foreach(FILE ${SIMD_FILES})
      list(APPEND WEBP_SIMD_FILES_NOT_TO_INCLUDE ${FILE})
    endforeach()
-    # Explicitly disable SIMD.
-    if(SIMD_DISABLE_FLAGS)
+    # Explicitly disable SIMD. Skip this with WASM to avoid an ICE with clang:
+    # https://bugs.chromium.org/p/webp/issues/detail?id=350
+    # WASM overrides the native SIMD so building it in is harmless aside from
+    # binary size.
+    if(NOT WEBP_ENABLE_WASM AND SIMD_DISABLE_FLAGS)
      list(GET SIMD_DISABLE_FLAGS ${I_SIMD} SIMD_COMPILE_FLAG)
      include(CheckCCompilerFlag)
      if(SIMD_COMPILE_FLAG)
@@ -106,12 +104,11 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
            set(COMMON_PATTERNS)
          endif()
          set(CMAKE_REQUIRED_DEFINITIONS ${SIMD_COMPILE_FLAG})
-          check_c_source_compiles("int main(void) {return 0;}"
-            FLAG_${SIMD_COMPILE_FLAG}
+          check_c_source_compiles("int main(void) {return 0;}" FLAG_${SIMD_COMPILE_FLAG}
            FAIL_REGEX "warning: argument unused during compilation:"
            ${COMMON_PATTERNS}
          )
-          if(NOT FLAG_${SIMD_COMPILE_FLAG})
+          if(NOT FLAG_${SIMD_COMPILE_FLAG})  # per-flag name: the result is cached
            unset(HAS_COMPILE_FLAG CACHE)
          endif()
        endif()
@@ -121,5 +118,14 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
      endif()
    endif()
  endif()
-  cmake_pop_check_state()
 endforeach()
+
+## Add *_wasm.c files if enabled.
+if(WEBP_ENABLE_WASM)
+  # Glob the *_wasm.c DSP sources, then append the whole result to the list
+  # of SIMD files to build in a single call.
+  file(GLOB SIMD_FILES "${CMAKE_CURRENT_LIST_DIR}/../"
+    "src/dsp/*_wasm.c"
+  )
+  list(APPEND WEBP_SIMD_FILES_TO_INCLUDE ${SIMD_FILES})
+endif()
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([libwebp], [0.6.1],
+AC_INIT([libwebp], [0.6.0],
        [https://bugs.chromium.org/p/webp],,
        [http://developers.google.com/speed/webp])
 AC_CANONICAL_HOST
@@ -79,7 +79,6 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wold-style-definition])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wparentheses-equality])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshorten-64-to-32])
-TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wundef])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunreachable-code])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused-but-set-variable])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused])
@@ -445,12 +444,12 @@ AS_IF([test "x$enable_sdl" != "xno"], [
  CLEAR_LIBVARS([SDL])
  WITHLIB_OPTION([sdl], [SDL])

-  sdl_header="no"
+  sdl_header="no"  # default until one of the header checks below succeeds
  LIBCHECK_PROLOGUE([SDL])
  AC_CHECK_HEADER([SDL/SDL.h], [sdl_header="SDL_SDL.h"],
                  [AC_CHECK_HEADER([SDL.h], [sdl_header="SDL.h"],
                  [AC_MSG_WARN(SDL library not available - no sdl.h)])])
-  if test x"$sdl_header" != "xno"; then
+  if test x"$sdl_header" != "xno" ; then
    AC_CHECK_LIB(SDL, SDL_Init,
                 [SDL_LIBS="-lSDL"
                  SDL_INCLUDES="-DWEBP_HAVE_SDL"
@@ -459,14 +458,14 @@ AS_IF([test "x$enable_sdl" != "xno"], [
                  sdl_support=yes
                 ],
                 AC_MSG_WARN(Optional SDL library not found),
-                 [$MATH_LIBS])
-    if test x"$sdl_header" = "xSDL.h"; then
+                 [$MATH_LIBS])
+    if test x"$sdl_header" = "xSDL.h" ; then
      SDL_INCLUDES="$SDL_INCLUDES -DWEBP_HAVE_JUST_SDL_H"
    fi
  fi
  LIBCHECK_EPILOGUE([SDL])

-  if test "$sdl_support" = "yes"; then
+  if test "$sdl_support" = "yes" ; then
    build_vwebp_sdl=yes
  fi
 ])
@@ -590,7 +589,7 @@ AS_IF([test "x$enable_gif" != "xno"], [

  if test "$gif_support" = "yes" -a \
          "$enable_libwebpdemux" = "yes"; then
-    build_anim_diff=yes
+    build_animdiff=yes
  fi

  if test "$gif_support" = "yes" -a \
@@ -598,7 +597,7 @@ AS_IF([test "x$enable_gif" != "xno"], [
    build_gif2webp=yes
  fi
 ])
-AM_CONDITIONAL([BUILD_ANIMDIFF], [test "${build_anim_diff}" = "yes"])
+AM_CONDITIONAL([BUILD_ANIMDIFF], [test "${build_animdiff}" = "yes"])
 AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])

 if test "$enable_libwebpmux" = "yes"; then
@@ -663,7 +662,7 @@ if test "$enable_wic" = "yes"; then
 fi
 esac

-dnl === If --enable-swap-16bit-csp is defined, add -DWEBP_SWAP_16BIT_CSP=1
+dnl === If --enable-swap-16bit-csp is defined, add -DWEBP_SWAP_16BIT_CSP

 USE_SWAP_16BIT_CSP=""
 AC_MSG_CHECKING(if --enable-swap-16bit-csp option is specified)
@@ -671,7 +670,7 @@ AC_ARG_ENABLE([swap-16bit-csp],
              AS_HELP_STRING([--enable-swap-16bit-csp],
                             [Enable byte swap for 16 bit colorspaces]))
 if test "$enable_swap_16bit_csp" = "yes"; then
-  USE_SWAP_16BIT_CSP="-DWEBP_SWAP_16BIT_CSP=1"
+  USE_SWAP_16BIT_CSP="-DWEBP_SWAP_16BIT_CSP"
 fi
 AC_MSG_RESULT(${enable_swap_16bit_csp-no})
 AC_SUBST(USE_SWAP_16BIT_CSP)
@@ -689,21 +688,6 @@ fi
 AC_MSG_RESULT(${enable_experimental-no})
 AC_SUBST(USE_EXPERIMENTAL_CODE)

-dnl === If --disable-near-lossless is defined, add -DWEBP_NEAR_LOSSLESS=0
-
-AC_DEFINE(WEBP_NEAR_LOSSLESS, [1], [Enable near lossless encoding])
-AC_MSG_CHECKING(if --disable-near-lossless option is specified)
-AC_ARG_ENABLE([near_lossless],
-              AS_HELP_STRING([--disable-near-lossless],
-                             [Disable near lossless encoding]),
-              [], [enable_near_lossless=yes])
-if test "$enable_near_lossless" = "no"; then
-  AC_DEFINE(WEBP_NEAR_LOSSLESS, [0], [Enable near lossless encoding])
-  AC_MSG_RESULT([yes])
-else
-  AC_MSG_RESULT([no])
-fi
-
 dnl === Check whether libwebpmux should be built
 AC_MSG_CHECKING(whether libwebpmux is to be built)
 AC_ARG_ENABLE([libwebpmux],
@@ -778,7 +762,7 @@ dwebp : yes
  PNG  : ${png_support-no}
  WIC  : ${wic_support-no}
 GIF support : ${gif_support-no}
-anim_diff   : ${build_anim_diff-no}
+anim_diff   : ${build_animdiff-no}
 gif2webp    : ${build_gif2webp-no}
 img2webp    : ${build_img2webp-no}
 webpmux     : ${enable_libwebpmux-no}
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -2,7 +2,7 @@ AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src

 bin_PROGRAMS = dwebp cwebp
 if BUILD_ANIMDIFF
-  noinst_PROGRAMS = anim_diff anim_dump
+  noinst_PROGRAMS = anim_diff
 endif
 if BUILD_GIF2WEBP
  bin_PROGRAMS += gif2webp
@@ -27,36 +27,20 @@ libexample_util_la_LIBADD = ../src/libwebp.la

 anim_diff_SOURCES = anim_diff.c anim_util.c anim_util.h
 anim_diff_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
-anim_diff_LDADD  =
-anim_diff_LDADD += ../src/demux/libwebpdemux.la
-anim_diff_LDADD += libexample_util.la
-anim_diff_LDADD += ../imageio/libimageio_util.la
+anim_diff_LDADD  = ../src/demux/libwebpdemux.la
+anim_diff_LDADD += libexample_util.la ../imageio/libimageio_util.la
 anim_diff_LDADD += $(GIF_LIBS) -lm

-anim_dump_SOURCES = anim_dump.c anim_util.c anim_util.h
-anim_dump_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(PNG_INCLUDES)
-anim_dump_CPPFLAGS += $(GIF_INCLUDES)
-anim_dump_LDADD  =
-anim_dump_LDADD += ../src/demux/libwebpdemux.la
-anim_dump_LDADD += libexample_util.la
-anim_dump_LDADD += ../imageio/libimageio_util.la
-anim_dump_LDADD += ../imageio/libimageenc.la
-anim_dump_LDADD += $(PNG_LIBS) $(GIF_LIBS) $(TIFF_LIBS) -lm
-
 cwebp_SOURCES  = cwebp.c stopwatch.h
 cwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-cwebp_LDADD  =
-cwebp_LDADD += libexample_util.la
-cwebp_LDADD += ../imageio/libimageio_util.la
-cwebp_LDADD += ../imageio/libimagedec.la
-cwebp_LDADD += ../src/libwebp.la
+cwebp_LDADD  = libexample_util.la ../imageio/libimageio_util.la
+cwebp_LDADD += ../imageio/libimagedec.la ../src/libwebp.la
 cwebp_LDADD += $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)

 dwebp_SOURCES = dwebp.c stopwatch.h
 dwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 dwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES)
-dwebp_LDADD  =
-dwebp_LDADD += libexample_util.la
+dwebp_LDADD  = libexample_util.la
 dwebp_LDADD += ../imageio/libimagedec.la
 dwebp_LDADD += ../imageio/libimageenc.la
 dwebp_LDADD += ../imageio/libimageio_util.la
@@ -65,52 +49,35 @@ dwebp_LDADD +=$(PNG_LIBS) $(JPEG_LIBS)

 gif2webp_SOURCES = gif2webp.c gifdec.c gifdec.h
 gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
-gif2webp_LDADD  =
-gif2webp_LDADD += libexample_util.la
-gif2webp_LDADD += ../imageio/libimageio_util.la
-gif2webp_LDADD += ../src/mux/libwebpmux.la
-gif2webp_LDADD += ../src/libwebp.la
-gif2webp_LDADD += $(GIF_LIBS)
+gif2webp_LDADD  = libexample_util.la ../imageio/libimageio_util.la
+gif2webp_LDADD += ../src/mux/libwebpmux.la ../src/libwebp.la $(GIF_LIBS)

 vwebp_SOURCES = vwebp.c
 vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GL_INCLUDES)
-vwebp_LDADD  =
-vwebp_LDADD += libexample_util.la
-vwebp_LDADD += ../imageio/libimageio_util.la
-vwebp_LDADD += ../src/demux/libwebpdemux.la
-vwebp_LDADD += $(GL_LIBS)
+vwebp_LDADD  = libexample_util.la ../imageio/libimageio_util.la
+vwebp_LDADD += ../src/demux/libwebpdemux.la $(GL_LIBS)

 webpmux_SOURCES = webpmux.c
 webpmux_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-webpmux_LDADD  =
-webpmux_LDADD += libexample_util.la
-webpmux_LDADD += ../imageio/libimageio_util.la
-webpmux_LDADD += ../src/mux/libwebpmux.la
-webpmux_LDADD += ../src/libwebp.la
+webpmux_LDADD  = libexample_util.la ../imageio/libimageio_util.la
+webpmux_LDADD += ../src/mux/libwebpmux.la ../src/libwebp.la

 img2webp_SOURCES = img2webp.c
 img2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-img2webp_LDADD  =
-img2webp_LDADD += libexample_util.la
-img2webp_LDADD += ../imageio/libimageio_util.la
+img2webp_LDADD  = libexample_util.la ../imageio/libimageio_util.la
 img2webp_LDADD += ../imageio/libimagedec.la
-img2webp_LDADD += ../src/mux/libwebpmux.la
-img2webp_LDADD += ../src/libwebp.la
+img2webp_LDADD += ../src/mux/libwebpmux.la ../src/libwebp.la
 img2webp_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)

 webpinfo_SOURCES = webpinfo.c
 webpinfo_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-webpinfo_LDADD  =
-webpinfo_LDADD += libexample_util.la
-webpinfo_LDADD += ../imageio/libimageio_util.la
+webpinfo_LDADD  = libexample_util.la ../imageio/libimageio_util.la
 webpinfo_LDADD += ../src/libwebp.la

 if BUILD_LIBWEBPDECODER
  anim_diff_LDADD += ../src/libwebpdecoder.la
-  anim_dump_LDADD += ../src/libwebpdecoder.la
  vwebp_LDADD += ../src/libwebpdecoder.la
 else
  anim_diff_LDADD += ../src/libwebp.la
-  anim_dump_LDADD += ../src/libwebp.la
  vwebp_LDADD += ../src/libwebp.la
 endif
--- a/examples/anim_diff.c
+++ b/examples/anim_diff.c
@@ -143,18 +143,8 @@ static int CompareAnimatedImagePair(const AnimatedImage* const img1,
  if (!ok) return 0;  // These are fatal failures, can't proceed.

  if (is_multi_frame_image) {  // Checks relevant for multi-frame images only.
-    int max_loop_count_workaround = 0;
-    // Transcodes to webp increase the gif loop count by 1 for compatibility.
-    // When the gif has the maximum value the webp value will be off by one.
-    if ((img1->format == ANIM_GIF && img1->loop_count == 65536 &&
-         img2->format == ANIM_WEBP && img2->loop_count == 65535) ||
-        (img1->format == ANIM_WEBP && img1->loop_count == 65535 &&
-         img2->format == ANIM_GIF && img2->loop_count == 65536)) {
-      max_loop_count_workaround = 1;
-    }
-    ok = (max_loop_count_workaround ||
-          CompareValues(img1->loop_count, img2->loop_count,
-                        "Loop count mismatch")) && ok;
+    ok = CompareValues(img1->loop_count, img2->loop_count,
+                       "Loop count mismatch") && ok;
    ok = CompareBackgroundColor(img1->bgcolor, img2->bgcolor,
                                premultiply) && ok;
  }
--- a/examples/anim_dump.c
+++ b/examples/anim_dump.c
@@ -1,104 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Decodes an animated WebP file and dumps the decoded frames as PNG or TIFF.
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <stdio.h>
-#include <string.h>  // for 'strcmp'.
-
-#include "./anim_util.h"
-#include "webp/decode.h"
-#include "../imageio/image_enc.h"
-
-#if defined(_MSC_VER) && _MSC_VER < 1900
-#define snprintf _snprintf
-#endif
-
-static void Help(void) {
-  printf("Usage: anim_dump [options] files...\n");
-  printf("\nOptions:\n");
-  printf("  -folder <string> .... dump folder (default: '.')\n");
-  printf("  -prefix <string> .... prefix for dumped frames "
-                                  "(default: 'dump_')\n");
-  printf("  -tiff ............... save frames as TIFF\n");
-  printf("  -pam ................ save frames as PAM\n");
-}
-
-int main(int argc, const char* argv[]) {
-  int error = 0;
-  const char* dump_folder = ".";
-  const char* prefix = "dump_";
-  const char* suffix = "png";
-  WebPOutputFileFormat format = PNG;
-  int c;
-
-  if (argc < 2) {
-    Help();
-    return -1;
-  }
-
-  for (c = 1; !error && c < argc; ++c) {
-    if (!strcmp(argv[c], "-folder")) {
-      if (c + 1 == argc) {
-        fprintf(stderr, "missing argument after option '%s'\n", argv[c]);
-        error = 1;
-        break;
-      }
-      dump_folder = argv[++c];
-    } else if (!strcmp(argv[c], "-prefix")) {
-      if (c + 1 == argc) {
-        fprintf(stderr, "missing argument after option '%s'\n", argv[c]);
-        error = 1;
-        break;
-      }
-      prefix = argv[++c];
-    } else if (!strcmp(argv[c], "-tiff")) {
-      format = TIFF;
-      suffix = "tiff";
-    } else if (!strcmp(argv[c], "-pam")) {
-      format = PAM;
-      suffix = "pam";
-    } else {
-      uint32_t i;
-      AnimatedImage image;
-      const char* const file = argv[c];
-      memset(&image, 0, sizeof(image));
-      printf("Decoding file: %s as %s/%sxxxx.%s\n",
-             file, dump_folder, prefix, suffix);
-      if (!ReadAnimatedImage(file, &image, 0, NULL)) {
-        fprintf(stderr, "Error decoding file: %s\n Aborting.\n", file);
-        error = 1;
-        break;
-      }
-      for (i = 0; !error && i < image.num_frames; ++i) {
-        char out_file[1024];
-        WebPDecBuffer buffer;
-        WebPInitDecBuffer(&buffer);
-        buffer.colorspace = MODE_RGBA;
-        buffer.is_external_memory = 1;
-        buffer.width = image.canvas_width;
-        buffer.height = image.canvas_height;
-        buffer.u.RGBA.rgba = image.frames[i].rgba;
-        buffer.u.RGBA.stride = buffer.width * sizeof(uint32_t);
-        buffer.u.RGBA.size = buffer.u.RGBA.stride * buffer.height;
-        snprintf(out_file, sizeof(out_file), "%s/%s%.4d.%s",
-                 dump_folder, prefix, i, suffix);
-        if (!WebPSaveImage(&buffer, format, out_file)) {
-          fprintf(stderr, "Error while saving image '%s'\n", out_file);
-          error = 1;
-        }
-        WebPFreeDecBuffer(&buffer);
-      }
-      ClearAnimatedImage(&image);
-    }
-  }
-  return error ? 1 : 0;
-}
--- a/examples/anim_util.c
+++ b/examples/anim_util.c
@@ -16,7 +16,7 @@
 #include <stdio.h>
 #include <string.h>

-#if defined(WEBP_HAVE_GIF)
+#ifdef WEBP_HAVE_GIF
 #include <gif_lib.h>
 #endif
 #include "webp/format_constants.h"
@@ -33,13 +33,11 @@ static const int kNumChannels = 4;
 // -----------------------------------------------------------------------------
 // Common utilities.

-#if defined(WEBP_HAVE_GIF)
 // Returns true if the frame covers the full canvas.
 static int IsFullFrame(int width, int height,
                       int canvas_width, int canvas_height) {
  return (width == canvas_width && height == canvas_height);
 }
-#endif // WEBP_HAVE_GIF

 static int CheckSizeForOverflow(uint64_t size) {
  return (size == (size_t)size);
@@ -87,7 +85,6 @@ void ClearAnimatedImage(AnimatedImage* const image) {
  }
 }

-#if defined(WEBP_HAVE_GIF)
 // Clear the canvas to transparent.
 static void ZeroFillCanvas(uint8_t* rgba,
                           uint32_t canvas_width, uint32_t canvas_height) {
@@ -129,7 +126,6 @@ static void CopyFrameRectangle(const uint8_t* src, uint8_t* dst, int stride,
    dst += stride;
  }
 }
-#endif // WEBP_HAVE_GIF

 // Canonicalize all transparent pixels to transparent black to aid comparison.
 static void CleanupTransparentPixels(uint32_t* rgba,
@@ -156,8 +152,6 @@ static int DumpFrame(const char filename[], const char dump_folder[],
  FILE* f = NULL;
  const char* row;

-  if (dump_folder == NULL) dump_folder = ".";
-
  base_name = strrchr(filename, '/');
  base_name = (base_name == NULL) ? filename : base_name + 1;
  max_len = strlen(dump_folder) + 1 + strlen(base_name)
@@ -206,7 +200,7 @@ static int IsWebP(const WebPData* const webp_data) {
  return (WebPGetInfo(webp_data->bytes, webp_data->size, NULL, NULL) != 0);
 }

-// Read animated WebP bitstream 'webp_data' into 'AnimatedImage' struct.
+// Read animated WebP bitstream 'webp_data' into 'AnimatedImage' struct.
 static int ReadAnimatedWebP(const char filename[],
                            const WebPData* const webp_data,
                            AnimatedImage* const image, int dump_frames,
@@ -275,7 +269,6 @@ static int ReadAnimatedWebP(const char filename[],
    prev_frame_timestamp = timestamp;
  }
  ok = dump_ok;
-  if (ok) image->format = ANIM_WEBP;

 End:
  WebPAnimDecoderDelete(dec);
@@ -285,7 +278,7 @@ static int ReadAnimatedWebP(const char filename[],
 // -----------------------------------------------------------------------------
 // GIF Decoding.

-#if defined(WEBP_HAVE_GIF)
+#ifdef WEBP_HAVE_GIF

 // Returns true if this is a valid GIF bitstream.
 static int IsGIF(const WebPData* const data) {
@@ -430,11 +423,6 @@ static uint32_t GetBackgroundColorGIF(GifFileType* gif) {
 }

 // Find appropriate app extension and get loop count from the next extension.
-// We use Chrome's interpretation of the 'loop_count' semantics:
-//   if not present -> loop once
-//   if present and loop_count == 0, return 0 ('infinite').
-//   if present and loop_count != 0, it's the number of *extra* loops
-//     so we need to return loop_count + 1 as total loop number.
 static uint32_t GetLoopCountGIF(const GifFileType* const gif) {
  int i;
  for (i = 0; i < gif->ImageCount; ++i) {
@@ -452,13 +440,12 @@ static uint32_t GetLoopCountGIF(const GifFileType* const gif) {
      if (signature_is_ok &&
          eb2->Function == CONTINUE_EXT_FUNC_CODE && eb2->ByteCount >= 3 &&
          eb2->Bytes[0] == 1) {
-        const uint32_t extra_loop = ((uint32_t)(eb2->Bytes[2]) << 8) +
-                                    ((uint32_t)(eb2->Bytes[1]) << 0);
-        return (extra_loop > 0) ? extra_loop + 1 : 0;
+        return ((uint32_t)(eb2->Bytes[2]) << 8) +
+               ((uint32_t)(eb2->Bytes[1]) << 0);
      }
    }
  }
-  return 1;  // Default.
+  return 0;  // Default.
 }

 // Get duration of 'n'th frame in milliseconds.
@@ -685,7 +672,6 @@ static int ReadAnimatedGIF(const char filename[], AnimatedImage* const image,
      }
    }
  }
-  image->format = ANIM_GIF;
  DGifCloseFile(gif, NULL);
  return 1;
 }
--- a/examples/anim_util.h
+++ b/examples/anim_util.h
@@ -22,11 +22,6 @@
 extern "C" {
 #endif

-typedef enum {
-  ANIM_GIF,
-  ANIM_WEBP
-} AnimatedFileFormat;
-
 typedef struct {
  uint8_t* rgba;         // Decoded and reconstructed full frame.
  int duration;          // Frame duration in milliseconds.
@@ -34,7 +29,6 @@ typedef struct {
 } DecodedFrame;

 typedef struct {
-  AnimatedFileFormat format;
  uint32_t canvas_width;
  uint32_t canvas_height;
  uint32_t bgcolor;
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@@ -463,9 +463,8 @@ static int WriteWebPWithMetadata(FILE* const out,
    } else {
      const int is_lossless = !memcmp(webp, "VP8L", kTagSize);
      if (is_lossless) {
-        // Presence of alpha is stored in the 37th bit (29th after the
-        // signature) of VP8L data.
-        if (webp[kChunkHeaderSize + 4] & (1 << 4)) flags |= kAlphaFlag;
+        // Presence of alpha is stored in the 29th bit of VP8L data.
+        if (webp[kChunkHeaderSize + 3] & (1 << 5)) flags |= kAlphaFlag;
      }
      ok = ok && (fwrite(kVP8XHeader, kChunkHeaderSize, 1, out) == 1);
      ok = ok && WriteLE32(out, flags);
@@ -487,10 +486,10 @@ static int WriteWebPWithMetadata(FILE* const out,
      *metadata_written |= METADATA_XMP;
    }
    return ok;
+  } else {
+    // No metadata, just write the original image file.
+    return (fwrite(webp, webp_size, 1, out) == 1);
  }
-
-  // No metadata, just write the original image file.
-  return (fwrite(webp, webp_size, 1, out) == 1);
 }

 //------------------------------------------------------------------------------
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@@ -72,10 +72,8 @@ static void Help(void) {
  printf("  -metadata <string> ..... comma separated list of metadata to\n");
  printf("                           ");
  printf("copy from the input to the output if present\n");
-  printf("                           ");
-  printf("Valid values: all, none, icc, xmp (default)\n");
-  printf("  -loop_compatibility .... use compatibility mode for Chrome\n");
-  printf("                           version prior to M62 (inclusive)\n");
+  printf("                           "
+         "Valid values: all, none, icc, xmp (default)\n");
  printf("  -mt .................... use multi-threading if available\n");
  printf("\n");
  printf("  -version ............... print version number and exit\n");
@@ -106,7 +104,7 @@ int main(int argc, const char *argv[]) {
  WebPAnimEncoderOptions enc_options;
  WebPConfig config;

-  int frame_number = 0;     // Whether we are processing the first frame.
+  int is_first_frame = 1;     // Whether we are processing the first frame.
  int done;
  int c;
  int quiet = 0;
@@ -117,9 +115,8 @@ int main(int argc, const char *argv[]) {
  int stored_icc = 0;         // Whether we have already stored an ICC profile.
  WebPData xmp_data;
  int stored_xmp = 0;         // Whether we have already stored an XMP profile.
-  int loop_count = 0;         // default: infinite
+  int loop_count = 0;
  int stored_loop_count = 0;  // Whether we have found an explicit loop count.
-  int loop_compatibility = 0;
  WebPMux* mux = NULL;

  int default_kmin = 1;  // Whether to use default kmin value.
@@ -154,8 +151,6 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-mixed")) {
      enc_options.allow_mixed = 1;
      config.lossless = 0;
-    } else if (!strcmp(argv[c], "-loop_compatibility")) {
-      loop_compatibility = 1;
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
@@ -282,7 +277,7 @@ int main(int argc, const char *argv[]) {

        if (!DGifGetImageDesc(gif)) goto End;

-        if (frame_number == 0) {
+        if (is_first_frame) {
          if (verbose) {
            printf("Canvas screen: %d x %d\n", gif->SWidth, gif->SHeight);
          }
@@ -324,6 +319,7 @@ int main(int argc, const char *argv[]) {
                    "a memory error.\n");
            goto End;
          }
+          is_first_frame = 0;
        }

        // Some even more broken GIF can have sub-rect with zero width/height.
@@ -340,11 +336,7 @@ int main(int argc, const char *argv[]) {
        GIFBlendFrames(&frame, &gif_rect, &curr_canvas);

        if (!WebPAnimEncoderAdd(enc, &curr_canvas, frame_timestamp, &config)) {
-          fprintf(stderr, "Error while adding frame #%d: %s\n", frame_number,
-                  WebPAnimEncoderGetError(enc));
-          goto End;
-        } else {
-          ++frame_number;
+          fprintf(stderr, "%s\n", WebPAnimEncoderGetError(enc));
        }

        // Update canvases.
@@ -394,7 +386,7 @@ int main(int argc, const char *argv[]) {
              if (verbose) {
                fprintf(stderr, "Loop count: %d\n", loop_count);
              }
-              stored_loop_count = loop_compatibility ? (loop_count != 0) : 1;
+              stored_loop_count = (loop_count != 0);
            } else {  // An extension containing metadata.
              // We only store the first encountered chunk of each type, and
              // only if requested by the user.
@@ -451,23 +443,6 @@ int main(int argc, const char *argv[]) {
    goto End;
  }

-  if (!loop_compatibility) {
-    if (!stored_loop_count) {
-      // if no loop-count element is seen, the default is '1' (loop-once)
-      // and we need to signal it explicitly in WebP. Note however that
-      // in case there's a single frame, we still don't need to store it.
-      if (frame_number > 1) {
-        stored_loop_count = 1;
-        loop_count = 1;
-      }
-    } else if (loop_count > 0 && loop_count < 65535) {
-      // adapt GIF's semantic to WebP's (except in the infinite-loop case)
-      loop_count += 1;
-    }
-  }
-  // loop_count of 0 is the default (infinite), so no need to signal it
-  if (loop_count == 0) stored_loop_count = 0;
-
  if (stored_loop_count || stored_icc || stored_xmp) {
    // Re-mux to add loop count and/or metadata as needed.
    mux = WebPMuxCreate(&webp_data, 1);
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@@ -248,9 +248,9 @@ static void HandleKey(unsigned char key, int pos_x, int pos_y) {
      }
    }
  } else if (key == 'i') {
-    // Note: doesn't handle refresh of animation's last-frame (it's quite
-    // more involved to do, since you need to save the previous frame).
    kParams.print_info = 1 - kParams.print_info;
+    // TODO(skal): handle refresh of animation's last-frame too. It's quite
+    // more involved though (need to save the previous frame).
    if (!kParams.has_animation) ClearPreviousFrame();
    glutPostRedisplay();
  } else if (key == 'd') {
@@ -260,8 +260,8 @@ static void HandleKey(unsigned char key, int pos_x, int pos_y) {
 }

 static void HandleReshape(int width, int height) {
-  // Note: reshape doesn't preserve aspect ratio, and might
-  // be handling larger-than-screen pictures incorrectly.
+  // TODO(skal): should we preserve aspect ratio?
+  // Also: handle larger-than-screen pictures correctly.
  glViewport(0, 0, width, height);
  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
@@ -378,23 +378,13 @@ static void HandleDisplay(void) {
    }
  }
  glPopMatrix();
-#if defined(__APPLE__) || defined(_WIN32)
-  glFlush();
-#else
  glutSwapBuffers();
-#endif
 }

 static void StartDisplay(void) {
  const int width = kParams.canvas_width;
  const int height = kParams.canvas_height;
-  // TODO(webp:365) GLUT_DOUBLE results in flickering / old frames to be
-  // partially displayed with animated webp + alpha.
-#if defined(__APPLE__) || defined(_WIN32)
-  glutInitDisplayMode(GLUT_RGBA);
-#else
  glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
-#endif
  glutInitWindowSize(width, height);
  glutCreateWindow("WebP viewer");
  glutDisplayFunc(HandleDisplay);
--- a/examples/webpinfo.c
+++ b/examples/webpinfo.c
@@ -233,20 +233,20 @@ static int GetSignedBits(const uint8_t* const data, size_t data_size, size_t nb,
  return 1;
 }

-#define GET_BITS(v, n)                                 \
-  do {                                                 \
-    if (!GetBits(data, data_size, n, &(v), bit_pos)) { \
-      LOG_ERROR("Truncated lossy bitstream.");         \
-      return WEBP_INFO_TRUNCATED_DATA;                 \
-    }                                                  \
+#define GET_BITS(v, n)                               \
+  do {                                               \
+    if (!GetBits(data, data_size, n, &v, bit_pos)) { \
+      LOG_ERROR("Truncated lossy bitstream.");       \
+      return WEBP_INFO_TRUNCATED_DATA;               \
+    }                                                \
  } while (0)

-#define GET_SIGNED_BITS(v, n)                                \
-  do {                                                       \
-    if (!GetSignedBits(data, data_size, n, &(v), bit_pos)) { \
-      LOG_ERROR("Truncated lossy bitstream.");               \
-      return WEBP_INFO_TRUNCATED_DATA;                       \
-    }                                                        \
+#define GET_SIGNED_BITS(v, n)                              \
+  do {                                                     \
+    if (!GetSignedBits(data, data_size, n, &v, bit_pos)) { \
+      LOG_ERROR("Truncated lossy bitstream.");             \
+      return WEBP_INFO_TRUNCATED_DATA;                     \
+    }                                                      \
  } while (0)

 static WebPInfoStatus ParseLossySegmentHeader(const WebPInfo* const webp_info,
@@ -462,12 +462,12 @@ static int LLGetBits(const uint8_t* const data, size_t data_size, size_t nb,
  return 1;
 }

-#define LL_GET_BITS(v, n)                                \
-  do {                                                   \
-    if (!LLGetBits(data, data_size, n, &(v), bit_pos)) { \
-      LOG_ERROR("Truncated lossless bitstream.");        \
-      return WEBP_INFO_TRUNCATED_DATA;                   \
-    }                                                    \
+#define LL_GET_BITS(v, n)                              \
+  do {                                                 \
+    if (!LLGetBits(data, data_size, n, &v, bit_pos)) { \
+      LOG_ERROR("Truncated lossless bitstream.");      \
+      return WEBP_INFO_TRUNCATED_DATA;                 \
+    }                                                  \
  } while (0)

 static WebPInfoStatus ParseLosslessTransform(WebPInfo* const webp_info,
@@ -817,8 +817,9 @@ static WebPInfoStatus ProcessImageChunk(const ChunkData* const chunk_data,
    if (webp_info->seen_image_subchunk_) {
      LOG_ERROR("Consecutive VP8/VP8L sub-chunks in an ANMF chunk.");
      return WEBP_INFO_PARSE_ERROR;
+    } else {
+      webp_info->seen_image_subchunk_ = 1;
    }
-    webp_info->seen_image_subchunk_ = 1;
  } else {
    if (webp_info->chunk_counts_[CHUNK_VP8] ||
        webp_info->chunk_counts_[CHUNK_VP8L]) {
@@ -872,9 +873,9 @@ static WebPInfoStatus ProcessALPHChunk(const ChunkData* const chunk_data,
    if (webp_info->seen_alpha_subchunk_) {
      LOG_ERROR("Consecutive ALPH sub-chunks in an ANMF chunk.");
      return WEBP_INFO_PARSE_ERROR;
+    } else {
+      webp_info->seen_alpha_subchunk_ = 1;
    }
-    webp_info->seen_alpha_subchunk_ = 1;
-
    if (webp_info->seen_image_subchunk_) {
      LOG_ERROR("ALPHA sub-chunk detected after VP8 sub-chunk "
                "in an ANMF chunk.");
@@ -1106,7 +1107,6 @@ static void HelpLong(void) {
         "Note: there could be multiple input files;\n"
         "      options must come before input files.\n"
         "Options:\n"
-         "  -version ........... Print version number and exit.\n"
         "  -quiet ............. Do not show chunk parsing information.\n"
         "  -diag .............. Show parsing error diagnosis.\n"
         "  -summary ........... Show chunk stats summary.\n"
@@ -1140,11 +1140,6 @@ int main(int argc, const char* argv[]) {
      show_summary = 1;
    } else if (!strcmp(argv[c], "-bitstream_info")) {
      parse_bitstream = 1;
-    } else if (!strcmp(argv[c], "-version")) {
-      const int version = WebPGetDecoderVersion();
-      printf("WebP Decoder version: %d.%d.%d\n",
-             (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
-      return 0;
    } else {  // Assume the remaining are all input files.
      break;
    }
--- a/extras/Makefile.am
+++ b/extras/Makefile.am
@@ -1,4 +1,3 @@
-AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
 AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src
 noinst_LTLIBRARIES = libwebpextras.la

@@ -20,22 +19,18 @@ endif

 get_disto_SOURCES  = get_disto.c
 get_disto_CPPFLAGS = $(AM_CPPFLAGS)
-get_disto_LDADD =
-get_disto_LDADD += ../imageio/libimageio_util.la
-get_disto_LDADD += ../imageio/libimagedec.la
+get_disto_LDADD = ../imageio/libimageio_util.la ../imageio/libimagedec.la
 get_disto_LDADD += ../src/libwebp.la
 get_disto_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)

 webp_quality_SOURCES  = webp_quality.c
 webp_quality_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-webp_quality_LDADD =
-webp_quality_LDADD += ../imageio/libimageio_util.la
+webp_quality_LDADD  = ../imageio/libimageio_util.la
 webp_quality_LDADD += libwebpextras.la
 webp_quality_LDADD += ../src/libwebp.la

 vwebp_sdl_SOURCES  = vwebp_sdl.c webp_to_sdl.c webp_to_sdl.h
 vwebp_sdl_CPPFLAGS = $(AM_CPPFLAGS) $(SDL_INCLUDES)
-vwebp_sdl_LDADD =
-vwebp_sdl_LDADD += ../imageio/libimageio_util.la
+vwebp_sdl_LDADD = ../imageio/libimageio_util.la
 vwebp_sdl_LDADD += ../src/libwebp.la
 vwebp_sdl_LDADD += $(SDL_LIBS)
--- a/extras/extras.c
+++ b/extras/extras.c
@@ -10,7 +10,7 @@
 //  Additional WebP utilities.
 //

-#include "extras/extras.h"
+#include "./extras.h"
 #include "webp/format_constants.h"

 #include <assert.h>
@@ -18,7 +18,7 @@

 #define XTRA_MAJ_VERSION 0
 #define XTRA_MIN_VERSION 1
-#define XTRA_REV_VERSION 1
+#define XTRA_REV_VERSION 0

 //------------------------------------------------------------------------------

--- a/extras/extras.h
+++ b/extras/extras.h
@@ -25,28 +25,28 @@ extern "C" {

 // Returns the version number of the extras library, packed in hexadecimal using
 // 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN int WebPGetExtrasVersion(void);
+WEBP_EXTERN(int) WebPGetExtrasVersion(void);

 //------------------------------------------------------------------------------
 // Ad-hoc colorspace importers.

 // Import luma sample (gray scale image) into 'picture'. The 'picture'
 // width and height must be set prior to calling this function.
-WEBP_EXTERN int WebPImportGray(const uint8_t* gray, WebPPicture* picture);
+WEBP_EXTERN(int) WebPImportGray(const uint8_t* gray, WebPPicture* picture);

 // Import rgb sample in RGB565 packed format into 'picture'. The 'picture'
 // width and height must be set prior to calling this function.
-WEBP_EXTERN int WebPImportRGB565(const uint8_t* rgb565, WebPPicture* pic);
+WEBP_EXTERN(int) WebPImportRGB565(const uint8_t* rgb565, WebPPicture* pic);

 // Import rgb sample in RGB4444 packed format into 'picture'. The 'picture'
 // width and height must be set prior to calling this function.
-WEBP_EXTERN int WebPImportRGB4444(const uint8_t* rgb4444, WebPPicture* pic);
+WEBP_EXTERN(int) WebPImportRGB4444(const uint8_t* rgb4444, WebPPicture* pic);

 // Import a color mapped image. The number of colors is less or equal to
 // MAX_PALETTE_SIZE. 'pic' must have been initialized. Its content, if any,
 // will be discarded. Returns 'false' in case of error, or if indexed[] contains
 // invalid indices.
-WEBP_EXTERN int
+WEBP_EXTERN(int)
 WebPImportColorMappedARGB(const uint8_t* indexed, int indexed_stride,
                          const uint32_t palette[], int palette_size,
                          WebPPicture* pic);
@@ -59,7 +59,7 @@ WebPImportColorMappedARGB(const uint8_t* indexed, int indexed_stride,
 // Otherwise (lossy bitstream), the returned value is in the range [0..100].
 // Any error (invalid bitstream, animated WebP, incomplete header, etc.)
 // will return a value of -1.
-WEBP_EXTERN int VP8EstimateQuality(const uint8_t* const data, size_t size);
+WEBP_EXTERN(int) VP8EstimateQuality(const uint8_t* const data, size_t size);

 //------------------------------------------------------------------------------

--- a/extras/get_disto.c
+++ b/extras/get_disto.c
@@ -24,8 +24,8 @@
 #include <string.h>

 #include "webp/encode.h"
-#include "imageio/image_dec.h"
-#include "imageio/imageio_util.h"
+#include "../imageio/image_dec.h"
+#include "../imageio/imageio_util.h"

 static size_t ReadPicture(const char* const filename, WebPPicture* const pic,
                          int keep_alpha) {
@@ -322,7 +322,6 @@ int main(int argc, const char *argv[]) {
      fprintf(stderr, "Can only compute the difference map in ARGB format.\n");
      goto End;
    }
-#if !defined(WEBP_REDUCE_CSP)
    data_size = WebPEncodeLosslessBGRA((const uint8_t*)pic1.argb,
                                       pic1.width, pic1.height,
                                       pic1.argb_stride * 4,
@@ -334,12 +333,6 @@ int main(int argc, const char *argv[]) {
    ret = ImgIoUtilWriteFile(output, data, data_size) ? 0 : 1;
    WebPFree(data);
    if (ret) goto End;
-#else
-    (void)data;
-    (void)data_size;
-    fprintf(stderr, "Cannot save the difference map. Please recompile "
-                    "without the WEBP_REDUCE_CSP flag.\n");
-#endif  // WEBP_REDUCE_CSP
  }
  ret = 0;

--- a/extras/quality_estimate.c
+++ b/extras/quality_estimate.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "extras/extras.h"
+#include "./extras.h"
 #include "webp/decode.h"

 #include <math.h>
--- a/extras/vwebp_sdl.c
+++ b/extras/vwebp_sdl.c
@@ -24,7 +24,7 @@

 #include "webp_to_sdl.h"
 #include "webp/decode.h"
-#include "imageio/imageio_util.h"
+#include "../imageio/imageio_util.h"

 #if defined(WEBP_HAVE_JUST_SDL_H)
 #include <SDL.h>
--- a/extras/webp_quality.c
+++ b/extras/webp_quality.c
@@ -11,8 +11,8 @@
 #include <stdlib.h>
 #include <string.h>

-#include "extras/extras.h"
-#include "imageio/imageio_util.h"
+#include "./extras.h"
+#include "../imageio/imageio_util.h"

 int main(int argc, const char *argv[]) {
  int c;
--- a/extras/webp_to_sdl.c
+++ b/extras/webp_to_sdl.c
@@ -28,7 +28,6 @@
 #include <SDL/SDL.h>
 #endif

-static int init_ok = 0;
 int WebpToSDL(const char* data, unsigned int data_size) {
  int ok = 0;
  VP8StatusCode status;
@@ -43,10 +42,7 @@ int WebpToSDL(const char* data, unsigned int data_size) {
    return 1;
  }

-  if (!init_ok) {
-    SDL_Init(SDL_INIT_VIDEO);
-    init_ok = 1;
-  }
+  SDL_Init(SDL_INIT_VIDEO);

  status = WebPGetFeatures((uint8_t*)data, (size_t)data_size, &config.input);
  if (status != VP8_STATUS_OK) goto Error;
@@ -101,7 +97,6 @@ int WebpToSDL(const char* data, unsigned int data_size) {
 Error:
  SDL_FreeSurface(surface);
  SDL_FreeSurface(screen);
-  WebPFreeDecBuffer(output);
  return ok;
 }

--- a/imageio/Makefile.am
+++ b/imageio/Makefile.am
@@ -1,18 +1,13 @@
 AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src
-noinst_LTLIBRARIES =
-noinst_LTLIBRARIES += libimageio_util.la
-noinst_LTLIBRARIES += libimagedec.la
-noinst_LTLIBRARIES += libimageenc.la
+noinst_LTLIBRARIES = libimageio_util.la libimagedec.la libimageenc.la

 noinst_HEADERS =
 noinst_HEADERS += ../src/webp/decode.h
 noinst_HEADERS += ../src/webp/types.h

-libimageio_util_la_SOURCES =
-libimageio_util_la_SOURCES += imageio_util.c imageio_util.h
+libimageio_util_la_SOURCES = imageio_util.c imageio_util.h

-libimagedec_la_SOURCES  =
-libimagedec_la_SOURCES += image_dec.c image_dec.h
+libimagedec_la_SOURCES  = image_dec.c image_dec.h
 libimagedec_la_SOURCES += jpegdec.c jpegdec.h
 libimagedec_la_SOURCES += metadata.c metadata.h
 libimagedec_la_SOURCES += pngdec.c pngdec.h
@@ -23,7 +18,6 @@ libimagedec_la_SOURCES += wicdec.c wicdec.h
 libimagedec_la_CPPFLAGS = $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
 libimagedec_la_CPPFLAGS += $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)

-libimageenc_la_SOURCES  =
-libimageenc_la_SOURCES += image_enc.c image_enc.h
+libimageenc_la_SOURCES  = image_enc.c image_enc.h
 libimageenc_la_CPPFLAGS = $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
 libimageenc_la_CPPFLAGS += $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
--- a/imageio/image_enc.c
+++ b/imageio/image_enc.c
@@ -542,24 +542,22 @@ int WebPWriteYUV(FILE* fout, const WebPDecBuffer* const buffer) {
 // Generic top-level call

 int WebPSaveImage(const WebPDecBuffer* const buffer,
-                  WebPOutputFileFormat format,
-                  const char* const out_file_name) {
+                  WebPOutputFileFormat format, const char* const out_file) {
  FILE* fout = NULL;
  int needs_open_file = 1;
-  const int use_stdout = (out_file_name != NULL) && !strcmp(out_file_name, "-");
+  const int use_stdout = (out_file != NULL) && !strcmp(out_file, "-");
  int ok = 1;

-  if (buffer == NULL || out_file_name == NULL) return 0;
+  if (buffer == NULL || out_file == NULL) return 0;

 #ifdef HAVE_WINCODEC_H
  needs_open_file = (format != PNG);
 #endif

  if (needs_open_file) {
-    fout = use_stdout ? ImgIoUtilSetBinaryMode(stdout)
-                      : fopen(out_file_name, "wb");
+    fout = use_stdout ? ImgIoUtilSetBinaryMode(stdout) : fopen(out_file, "wb");
    if (fout == NULL) {
-      fprintf(stderr, "Error opening output file %s\n", out_file_name);
+      fprintf(stderr, "Error opening output file %s\n", out_file);
      return 0;
    }
  }
@@ -568,7 +566,7 @@ int WebPSaveImage(const WebPDecBuffer* const buffer,
      format == RGBA || format == BGRA || format == ARGB ||
      format == rgbA || format == bgrA || format == Argb) {
 #ifdef HAVE_WINCODEC_H
-    ok &= WebPWritePNG(out_file_name, use_stdout, buffer);
+    ok &= WebPWritePNG(out_file, use_stdout, buffer);
 #else
    ok &= WebPWritePNG(fout, buffer);
 #endif
--- a/imageio/imageio_util.c
+++ b/imageio/imageio_util.c
@@ -137,11 +137,7 @@ void ImgIoUtilCopyPlane(const uint8_t* src, int src_stride,

 int ImgIoUtilCheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
  const uint64_t total_size = nmemb * size;
-  int ok = (total_size == (size_t)total_size);
-#if defined(WEBP_MAX_IMAGE_SIZE)
-  ok = ok && (total_size <= (uint64_t)WEBP_MAX_IMAGE_SIZE);
-#endif
-  return ok;
+  return (total_size == (size_t)total_size);
 }

 // -----------------------------------------------------------------------------
--- a/imageio/jpegdec.c
+++ b/imageio/jpegdec.c
@@ -304,18 +304,18 @@ int ReadJPEG(const uint8_t* const data, size_t data_size,

  if (stride != (int)stride ||
      !ImgIoUtilCheckSizeArgumentsOverflow(stride, height)) {
-    goto Error;
+    goto End;
  }

  rgb = (uint8_t*)malloc((size_t)stride * height);
  if (rgb == NULL) {
-    goto Error;
+    goto End;
  }
  buffer[0] = (JSAMPLE*)rgb;

  while (dinfo.output_scanline < dinfo.output_height) {
    if (jpeg_read_scanlines((j_decompress_ptr)&dinfo, buffer, 1) != 1) {
-      goto Error;
+      goto End;
    }
    buffer[0] += stride;
  }
--- a/imageio/pnmdec.c
+++ b/imageio/pnmdec.c
@@ -117,13 +117,8 @@ static size_t ReadPAMFields(PNMInfo* const info, size_t off) {
    }
  }
  if (!(info->seen_flags & TUPLE_FLAG)) {
-    if (info->depth > 0 && info->depth <= 4) {
-      info->seen_flags |= TUPLE_FLAG;
-      info->bytes_per_px = info->depth * (info->max_value > 255 ? 2 : 1);
-    } else {
-      fprintf(stderr, "PAM: invalid bitdepth (%d).\n", info->depth);
-      return 0;
-    }
+    info->seen_flags |= TUPLE_FLAG;
+    info->bytes_per_px = info->depth * (info->max_value > 255 ? 2 : 1);
  }
  if (info->seen_flags != ALL_NEEDED_FLAGS) {
    fprintf(stderr, "PAM: incomplete header.\n");
--- a/imageio/webpdec.c
+++ b/imageio/webpdec.c
@@ -9,10 +9,6 @@
 //
 // WebP decode.

-#ifdef HAVE_CONFIG_H
-#include "webp/config.h"
-#endif
-
 #include "./webpdec.h"

 #include <stdio.h>
@@ -145,32 +141,17 @@ int ReadWebP(const uint8_t* const data, size_t data_size,

  do {
    const int has_alpha = keep_alpha && bitstream->has_alpha;
-    uint64_t stride;
    pic->width = bitstream->width;
    pic->height = bitstream->height;
-    if (pic->use_argb) {
-      stride = (uint64_t)bitstream->width * 4;
-    } else {
-      stride = (uint64_t)bitstream->width * (has_alpha ? 5 : 3) / 2;
-      pic->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
-    }
-
-    if (!ImgIoUtilCheckSizeArgumentsOverflow(stride, bitstream->height)) {
-      status = VP8_STATUS_OUT_OF_MEMORY;
-      break;
-    }
-
+    if (!pic->use_argb) pic->colorspace = has_alpha ? WEBP_YUV420A
+                                                    : WEBP_YUV420;
    ok = WebPPictureAlloc(pic);
    if (!ok) {
      status = VP8_STATUS_OUT_OF_MEMORY;
      break;
    }
    if (pic->use_argb) {
-#ifdef WORDS_BIGENDIAN
-      output_buffer->colorspace = MODE_ARGB;
-#else
      output_buffer->colorspace = MODE_BGRA;
-#endif
      output_buffer->u.RGBA.rgba = (uint8_t*)pic->argb;
      output_buffer->u.RGBA.stride = pic->argb_stride * sizeof(uint32_t);
      output_buffer->u.RGBA.size = output_buffer->u.RGBA.stride * pic->height;
--- a/makefile.unix
+++ b/makefile.unix
@@ -34,16 +34,6 @@ else
  GL_LIBS = -lglut -lGL
 endif

-# SDL flags: use sdl-config if it exists
-SDL_CONFIG = $(shell sdl-config --version 2> /dev/null)
-ifneq ($(SDL_CONFIG),)
-  SDL_LIBS = $(shell sdl-config --libs)
-  SDL_FLAGS = $(shell sdl-config --cflags)
-else
-  # use best-guess
-  SDL_LIBS = -lSDL
-  SDL_FLAGS =
-endif

 # To install libraries on Mac OS X:
 # 1. Install MacPorts (http://www.macports.org/install.php)
@@ -67,7 +57,7 @@ endif
 # EXTRA_FLAGS += -DWEBP_EXPERIMENTAL_FEATURES

 # Extra flags to enable byte swap for 16 bit colorspaces.
-# EXTRA_FLAGS += -DWEBP_SWAP_16BIT_CSP=1
+# EXTRA_FLAGS += -DWEBP_SWAP_16BIT_CSP

 # Extra flags to enable multi-threading
 EXTRA_FLAGS += -DWEBP_USE_THREAD
@@ -113,7 +103,7 @@ endif

 AR = ar
 ARFLAGS = r
-CPPFLAGS = -I. -Isrc/ -Wall
+CPPFLAGS = -Isrc/ -Wall
 CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
 CC = gcc
 INSTALL = install
@@ -183,6 +173,9 @@ DSP_DEC_OBJS = \
    src/dsp/yuv_sse2.o \

 DSP_ENC_OBJS = \
+    src/dsp/argb.o \
+    src/dsp/argb_mips_dsp_r2.o \
+    src/dsp/argb_sse2.o \
    src/dsp/cost.o \
    src/dsp/cost_mips32.o \
    src/dsp/cost_mips_dsp_r2.o \
@@ -342,8 +335,7 @@ OUT_LIBS += src/libwebp.a
 EXTRA_LIB = extras/libwebpextras.a
 OUT_EXAMPLES = examples/cwebp examples/dwebp
 EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux \
-                 examples/anim_diff examples/anim_dump \
-                 examples/img2webp examples/webpinfo
+                 examples/anim_diff examples/img2webp examples/webpinfo
 OTHER_EXAMPLES = extras/get_disto extras/webp_quality extras/vwebp_sdl

 OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
@@ -371,7 +363,7 @@ src/utils/bit_reader_utils.o: src/utils/endian_inl_utils.h
 src/utils/bit_writer_utils.o: src/utils/endian_inl_utils.h

 %.o: %.c $(HDRS)
-	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
+	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@

 examples/libanim_util.a: $(ANIM_UTIL_OBJS)
 examples/libexample_util.a: $(EX_UTIL_OBJS)
@@ -389,7 +381,6 @@ src/demux/libwebpdemux.a: $(LIBWEBPDEMUX_OBJS)
 	$(AR) $(ARFLAGS) $@ $^

 examples/anim_diff: examples/anim_diff.o $(ANIM_UTIL_OBJS) $(GIFDEC_OBJS)
-examples/anim_dump: examples/anim_dump.o $(ANIM_UTIL_OBJS)
 examples/cwebp: examples/cwebp.o
 examples/dwebp: examples/dwebp.o
 examples/gif2webp: examples/gif2webp.o $(GIFDEC_OBJS)
@@ -403,13 +394,6 @@ examples/anim_diff: src/demux/libwebpdemux.a examples/libexample_util.a
 examples/anim_diff: imageio/libimageio_util.a src/libwebp.a
 examples/anim_diff: EXTRA_LIBS += $(GIF_LIBS)
 examples/anim_diff: EXTRA_FLAGS += -DWEBP_HAVE_GIF
-examples/anim_dump: examples/libanim_util.a
-examples/anim_dump: src/demux/libwebpdemux.a
-examples/anim_dump: examples/libexample_util.a
-examples/anim_dump: imageio/libimageio_util.a
-examples/anim_dump: imageio/libimageenc.a
-examples/anim_dump: src/libwebp.a
-examples/anim_dump: EXTRA_LIBS += $(GIF_LIBS) $(DWEBP_LIBS)
 examples/cwebp: examples/libexample_util.a
 examples/cwebp: imageio/libimagedec.a
 examples/cwebp: imageio/libimageio_util.a
@@ -450,8 +434,8 @@ extras/vwebp_sdl: extras/vwebp_sdl.o
 extras/vwebp_sdl: extras/webp_to_sdl.o
 extras/vwebp_sdl: imageio/libimageio_util.a
 extras/vwebp_sdl: src/libwebp.a
-extras/vwebp_sdl: EXTRA_FLAGS += -DWEBP_HAVE_SDL $(SDL_FLAGS)
-extras/vwebp_sdl: EXTRA_LIBS += $(SDL_LIBS)
+extras/vwebp_sdl: EXTRA_FLAGS += -DWEBP_HAVE_SDL
+extras/vwebp_sdl: EXTRA_LIBS += -lSDL

 $(OUT_EXAMPLES) $(EXTRA_EXAMPLES) $(OTHER_EXAMPLES):
 	$(CC) -o $@ $^ $(LDFLAGS)
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "September 20, 2017"
+.TH GIF2WEBP 1 "January 25, 2017"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@@ -109,9 +109,6 @@ the range of 20 to 50.
 .TP
 .B \-mt
 Use multi-threading for encoding, if possible.
-.B \-loop_compatibility
-If enabled, handle the loop information in a compatible fashion for Chrome
-version prior to M62 (inclusive) and Firefox.
 .TP
 .B \-v
 Print extra information.
--- a/man/webpinfo.1
+++ b/man/webpinfo.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPINFO 1 "November 24, 2017"
+.TH WEBPINFO 1 "May 08, 2017"
 .SH NAME
 webpinfo \- print out the chunk level structure of WebP files
 along with basic integrity checks.
@@ -22,19 +22,16 @@ WebP format.

 .SH OPTIONS
 .TP
-.B \-version
-Print the version number (as major.minor.revision) and exit.
-.TP
-.B \-quiet
+.B -quiet
 Do not show chunk parsing information.
 .TP
-.B \-diag
+.B -diag
 Show parsing error diagnosis.
 .TP
-.B \-summary
+.B -summary
 Show chunk stats summary.
 .TP
-.BI \-bitstream_info
+.BI -bitstream_info
 Parse bitstream header.
 .TP
 .B \-h, \-help
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -22,7 +22,6 @@ commondir = $(includedir)/webp
 libwebp_la_SOURCES =
 libwebpinclude_HEADERS =
 libwebpinclude_HEADERS += webp/encode.h
-
 noinst_HEADERS =
 noinst_HEADERS += webp/format_constants.h

@@ -36,7 +35,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 7:1:0
+libwebp_la_LDFLAGS = -no-undefined -version-info 7:0:0
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

@@ -48,7 +47,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la

-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 3:1:0
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 3:0:0
  pkgconfig_DATA += libwebpdecoder.pc
 endif

--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@@ -1,4 +1,3 @@
-AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
 noinst_LTLIBRARIES = libwebpdecode.la

 libwebpdecode_la_SOURCES =
--- a/src/dec/alpha_dec.c
+++ b/src/dec/alpha_dec.c
@@ -12,13 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "src/dec/alphai_dec.h"
-#include "src/dec/vp8i_dec.h"
-#include "src/dec/vp8li_dec.h"
-#include "src/dsp/dsp.h"
-#include "src/utils/quant_levels_dec_utils.h"
-#include "src/utils/utils.h"
-#include "src/webp/format_constants.h"
+#include "./alphai_dec.h"
+#include "./vp8i_dec.h"
+#include "./vp8li_dec.h"
+#include "../dsp/dsp.h"
+#include "../utils/quant_levels_dec_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"

 //------------------------------------------------------------------------------
 // ALPHDecoder object.
--- a/src/dec/alphai_dec.h
+++ b/src/dec/alphai_dec.h
@@ -11,11 +11,11 @@
 //
 // Author: Urvang (urvang@google.com)

-#ifndef WEBP_DEC_ALPHAI_DEC_H_
-#define WEBP_DEC_ALPHAI_DEC_H_
+#ifndef WEBP_DEC_ALPHAI_H_
+#define WEBP_DEC_ALPHAI_H_

-#include "src/dec/webpi_dec.h"
-#include "src/utils/filters_utils.h"
+#include "./webpi_dec.h"
+#include "../utils/filters_utils.h"

 #ifdef __cplusplus
 extern "C" {
@@ -51,4 +51,4 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec);
 }    // extern "C"
 #endif

-#endif  /* WEBP_DEC_ALPHAI_DEC_H_ */
+#endif  /* WEBP_DEC_ALPHAI_H_ */
--- a/src/dec/buffer_dec.c
+++ b/src/dec/buffer_dec.c
@@ -13,15 +13,15 @@

 #include <stdlib.h>

-#include "src/dec/vp8i_dec.h"
-#include "src/dec/webpi_dec.h"
-#include "src/utils/utils.h"
+#include "./vp8i_dec.h"
+#include "./webpi_dec.h"
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------
 // WebPDecBuffer

 // Number of bytes per pixel for the different color-spaces.
-static const uint8_t kModeBpp[MODE_LAST] = {
+static const int kModeBpp[MODE_LAST] = {
  3, 4, 3, 4, 4, 2, 2,
  4, 4, 4, 2,    // pre-multiplied modes
  1, 1 };
@@ -36,7 +36,7 @@ static int IsValidColorspace(int webp_csp_mode) {
 // strictly speaking, the very last (or first, if flipped) row
 // doesn't require padding.
 #define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE)       \
-    ((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
+    ((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))

 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  int ok = 1;
@@ -74,8 +74,9 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
   } else {    // RGB checks
     const WebPRGBABuffer* const buf = &buffer->u.RGBA;
     const int stride = abs(buf->stride);
-    const uint64_t size =
-        MIN_BUFFER_SIZE(width * kModeBpp[mode], height, stride);
+    // The last row spans width * bytes-per-pixel bytes, not 'width' bytes.
+    const uint64_t size =
+        MIN_BUFFER_SIZE((uint64_t)width * kModeBpp[mode], height, stride);
     ok &= (size <= buf->size);
     ok &= (stride >= width * kModeBpp[mode]);
     ok &= (buf->rgba != NULL);
@@ -99,14 +98,15 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     uint64_t uv_size = 0, a_size = 0, total_size;
     // We need memory and it hasn't been allocated yet.
     // => initialize output buffer, now that dimensions are known.
     int stride;
     uint64_t size;

-    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
+    // Reject rows whose byte size would overflow the 'int' stride below.
+    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 31)) {
       return VP8_STATUS_INVALID_PARAM;
     }
     stride = w * kModeBpp[mode];
     size = (uint64_t)stride * h;
     if (!WebPIsRGBMode(mode)) {
       uv_stride = (w + 1) / 2;
       uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
@@ -175,11 +169,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
  return VP8_STATUS_OK;
 }

-VP8StatusCode WebPAllocateDecBuffer(int width, int height,
+VP8StatusCode WebPAllocateDecBuffer(int w, int h,
                                    const WebPDecoderOptions* const options,
-                                    WebPDecBuffer* const buffer) {
+                                    WebPDecBuffer* const out) {
  VP8StatusCode status;
-  if (buffer == NULL || width <= 0 || height <= 0) {
+  if (out == NULL || w <= 0 || h <= 0) {
    return VP8_STATUS_INVALID_PARAM;
  }
  if (options != NULL) {    // First, apply options if there is any.
@@ -188,39 +182,33 @@ VP8StatusCode WebPAllocateDecBuffer(int width, int height,
      const int ch = options->crop_height;
      const int x = options->crop_left & ~1;
      const int y = options->crop_top & ~1;
-      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
-          x + cw > width || y + ch > height) {
+      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
        return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
      }
-      width = cw;
-      height = ch;
+      w = cw;
+      h = ch;
    }
-
    if (options->use_scaling) {
-#if !defined(WEBP_REDUCE_SIZE)
      int scaled_width = options->scaled_width;
      int scaled_height = options->scaled_height;
      if (!WebPRescalerGetScaledDimensions(
-              width, height, &scaled_width, &scaled_height)) {
+              w, h, &scaled_width, &scaled_height)) {
        return VP8_STATUS_INVALID_PARAM;
      }
-      width = scaled_width;
-      height = scaled_height;
-#else
-      return VP8_STATUS_INVALID_PARAM;   // rescaling not supported
-#endif
+      w = scaled_width;
+      h = scaled_height;
    }
  }
-  buffer->width = width;
-  buffer->height = height;
+  out->width = w;
+  out->height = h;

  // Then, allocate buffer for real.
-  status = AllocateBuffer(buffer);
+  status = AllocateBuffer(out);
  if (status != VP8_STATUS_OK) return status;

  // Use the stride trick if vertical flip is needed.
  if (options != NULL && options->flip) {
-    status = WebPFlipBuffer(buffer);
+    status = WebPFlipBuffer(out);
  }
  return status;
 }
--- a/src/dec/common_dec.h
+++ b/src/dec/common_dec.h
@@ -11,8 +11,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#ifndef WEBP_DEC_COMMON_DEC_H_
-#define WEBP_DEC_COMMON_DEC_H_
+#ifndef WEBP_DEC_COMMON_H_
+#define WEBP_DEC_COMMON_H_

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -51,4 +51,4 @@ enum { MB_FEATURE_TREE_PROBS = 3,
       NUM_PROBAS = 11
     };

-#endif    // WEBP_DEC_COMMON_DEC_H_
+#endif    // WEBP_DEC_COMMON_H_
--- a/src/dec/frame_dec.c
+++ b/src/dec/frame_dec.c
@@ -12,13 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "src/dec/vp8i_dec.h"
-#include "src/utils/utils.h"
+#include "./vp8i_dec.h"
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------
 // Main reconstruction function.

-static const uint16_t kScan[16] = {
+static const int kScan[16] = {
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
@@ -320,7 +320,7 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
 #define MIN_DITHER_AMP 4

 #define DITHER_AMP_TAB_SIZE 12
-static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
+static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
  // roughly, it's dqm->uv_mat_[1]
  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
 };
@@ -728,7 +728,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
  }

  mem = (uint8_t*)dec->mem_;
-  dec->intra_t_ = mem;
+  dec->intra_t_ = (uint8_t*)mem;
  mem += intra_pred_mode_size;

  dec->yuv_t_ = (VP8TopSamples*)mem;
@@ -750,7 +750,7 @@ static int AllocateMemory(VP8Decoder* const dec) {

  mem = (uint8_t*)WEBP_ALIGN(mem);
  assert((yuv_size & WEBP_ALIGN_CST) == 0);
-  dec->yuv_b_ = mem;
+  dec->yuv_b_ = (uint8_t*)mem;
  mem += yuv_size;

  dec->mb_data_ = (VP8MBData*)mem;
@@ -766,7 +766,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
    const int extra_rows = kFilterExtraRows[dec->filter_type_];
    const int extra_y = extra_rows * dec->cache_y_stride_;
    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
-    dec->cache_y_ = mem + extra_y;
+    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
    dec->cache_u_ = dec->cache_y_
                  + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
    dec->cache_v_ = dec->cache_u_
@@ -776,7 +776,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
  mem += cache_size;

  // alpha plane
-  dec->alpha_plane_ = alpha_size ? mem : NULL;
+  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
  mem += alpha_size;
  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);

--- a/src/dec/idec_dec.c
+++ b/src/dec/idec_dec.c
@@ -15,10 +15,10 @@
 #include <string.h>
 #include <stdlib.h>

-#include "src/dec/alphai_dec.h"
-#include "src/dec/webpi_dec.h"
-#include "src/dec/vp8i_dec.h"
-#include "src/utils/utils.h"
+#include "./alphai_dec.h"
+#include "./webpi_dec.h"
+#include "./vp8i_dec.h"
+#include "../utils/utils.h"

 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
@@ -283,8 +283,10 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,

 static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
  if (idec->state_ == STATE_VP8_DATA) {
-    // Synchronize the thread, clean-up and check for errors.
-    VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
+    VP8Io* const io = &idec->io_;
+    if (io->teardown != NULL) {
+      io->teardown(io);
+    }
  }
  idec->state_ = STATE_ERROR;
  return error;
@@ -449,10 +451,7 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  VP8Io* const io = &idec->io_;

-  // Make sure partition #0 has been read before, to set dec to ready_.
-  if (!dec->ready_) {
-    return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
-  }
+  assert(dec->ready_);
  for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
    if (idec->last_mb_y_ != dec->mb_y_) {
      if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
@@ -492,7 +491,6 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  }
  // Synchronize the thread and check for errors.
  if (!VP8ExitCritical(dec, io)) {
-    idec->state_ = STATE_ERROR;  // prevent re-entry in IDecError
    return IDecError(idec, VP8_STATUS_USER_ABORT);
  }
  dec->ready_ = 0;
@@ -573,10 +571,6 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
    status = DecodePartition0(idec);
  }
  if (idec->state_ == STATE_VP8_DATA) {
-    const VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
-    if (dec == NULL) {
-      return VP8_STATUS_SUSPENDED;  // can't continue if we have no decoder.
-    }
    status = DecodeRemaining(idec);
  }
  if (idec->state_ == STATE_VP8L_HEADER) {
@@ -679,12 +673,12 @@ void WebPIDelete(WebPIDecoder* idec) {
 //------------------------------------------------------------------------------
 // Wrapper toward WebPINewDecoder

-WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer,
+WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
                          size_t output_buffer_size, int output_stride) {
  const int is_external_memory = (output_buffer != NULL) ? 1 : 0;
  WebPIDecoder* idec;

-  if (csp >= MODE_YUV) return NULL;
+  if (mode >= MODE_YUV) return NULL;
  if (is_external_memory == 0) {    // Overwrite parameters to sane values.
    output_buffer_size = 0;
    output_stride = 0;
@@ -695,7 +689,7 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer,
  }
  idec = WebPINewDecoder(NULL);
  if (idec == NULL) return NULL;
-  idec->output_.colorspace = csp;
+  idec->output_.colorspace = mode;
  idec->output_.is_external_memory = is_external_memory;
  idec->output_.u.RGBA.rgba = output_buffer;
  idec->output_.u.RGBA.stride = output_stride;
--- a/src/dec/io_dec.c
+++ b/src/dec/io_dec.c
@@ -13,11 +13,11 @@

 #include <assert.h>
 #include <stdlib.h>
-#include "src/dec/vp8i_dec.h"
-#include "src/dec/webpi_dec.h"
-#include "src/dsp/dsp.h"
-#include "src/dsp/yuv.h"
-#include "src/utils/utils.h"
+#include "../dec/vp8i_dec.h"
+#include "./webpi_dec.h"
+#include "../dsp/dsp.h"
+#include "../dsp/yuv.h"
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions
@@ -212,7 +212,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
    int num_rows;
    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
-#if (WEBP_SWAP_16BIT_CSP == 1)
+#ifdef WEBP_SWAP_16BIT_CSP
    uint8_t* alpha_dst = base_rgba;
 #else
    uint8_t* alpha_dst = base_rgba + 1;
@@ -241,7 +241,6 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
 //------------------------------------------------------------------------------
 // YUV rescaling (no final RGB conversion needed)

-#if !defined(WEBP_REDUCE_SIZE)
 static int Rescale(const uint8_t* src, int src_stride,
                   int new_lines, WebPRescaler* const wrk) {
  int num_lines_out = 0;
@@ -432,7 +431,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
                               int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
-#if (WEBP_SWAP_16BIT_CSP == 1)
+#ifdef WEBP_SWAP_16BIT_CSP
  uint8_t* alpha_dst = base_rgba;
 #else
  uint8_t* alpha_dst = base_rgba + 1;
@@ -542,8 +541,6 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
  return 1;
 }

-#endif  // WEBP_REDUCE_SIZE
-
 //------------------------------------------------------------------------------
 // Default custom functions

@@ -564,14 +561,10 @@ static int CustomSetup(VP8Io* io) {
    WebPInitUpsamplers();
  }
  if (io->use_scaling) {
-#if !defined(WEBP_REDUCE_SIZE)
    const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
    if (!ok) {
      return 0;    // memory error
    }
-#else
-    return 0;   // rescaling support not compiled
-#endif
  } else {
    if (is_rgb) {
      WebPInitSamplers();
@@ -605,6 +598,9 @@ static int CustomSetup(VP8Io* io) {
    }
  }

+  if (is_rgb) {
+    VP8YUVInit();
+  }
  return 1;
 }

--- a/src/dec/quant_dec.c
+++ b/src/dec/quant_dec.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dec/vp8i_dec.h"
+#include "./vp8i_dec.h"

 static WEBP_INLINE int clip(int v, int M) {
  return v < 0 ? 0 : v > M ? M : v;
--- a/src/dec/tree_dec.c
+++ b/src/dec/tree_dec.c
@@ -11,19 +11,15 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dec/vp8i_dec.h"
-#include "src/utils/bit_reader_inl_utils.h"
+#include "./vp8i_dec.h"
+#include "../utils/bit_reader_inl_utils.h"

-#if !defined(USE_GENERIC_TREE)
 #if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
-#define USE_GENERIC_TREE 1   // ALTERNATE_CODE
-#else
-#define USE_GENERIC_TREE 0
+#define USE_GENERIC_TREE
 #endif
-#endif  // USE_GENERIC_TREE

-#if (USE_GENERIC_TREE == 1)
+#ifdef USE_GENERIC_TREE
 static const int8_t kYModesIntra4[18] = {
  -B_DC_PRED, 1,
    -B_TM_PRED, 2,
@@ -321,7 +317,7 @@ static void ParseIntraMode(VP8BitReader* const br,
      int x;
      for (x = 0; x < 4; ++x) {
        const uint8_t* const prob = kBModesProba[top[x]][ymode];
-#if (USE_GENERIC_TREE == 1)
+#ifdef USE_GENERIC_TREE
        // Generic tree-parsing
        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
        while (i > 0) {
@@ -339,7 +335,7 @@ static void ParseIntraMode(VP8BitReader* const br,
                        (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
                          (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
-#endif  // USE_GENERIC_TREE
+#endif    // USE_GENERIC_TREE
        top[x] = ymode;
      }
      memcpy(modes, top, 4 * sizeof(*top));
@@ -502,7 +498,7 @@ static const uint8_t

 // Paragraph 9.9

-static const uint8_t kBands[16 + 1] = {
+static const int kBands[16 + 1] = {
  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
  0  // extra entry as sentinel
 };
--- a/src/dec/vp8_dec.c
+++ b/src/dec/vp8_dec.c
@@ -13,12 +13,12 @@

 #include <stdlib.h>

-#include "src/dec/alphai_dec.h"
-#include "src/dec/vp8i_dec.h"
-#include "src/dec/vp8li_dec.h"
-#include "src/dec/webpi_dec.h"
-#include "src/utils/bit_reader_inl_utils.h"
-#include "src/utils/utils.h"
+#include "./alphai_dec.h"
+#include "./vp8i_dec.h"
+#include "./vp8li_dec.h"
+#include "./webpi_dec.h"
+#include "../utils/bit_reader_inl_utils.h"
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------

--- a/src/dec/vp8_dec.h
+++ b/src/dec/vp8_dec.h
@@ -11,10 +11,10 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#ifndef WEBP_DEC_VP8_DEC_H_
-#define WEBP_DEC_VP8_DEC_H_
+#ifndef WEBP_WEBP_DECODE_VP8_H_
+#define WEBP_WEBP_DECODE_VP8_H_

-#include "src/webp/decode.h"
+#include "../webp/decode.h"

 #ifdef __cplusplus
 extern "C" {
@@ -157,24 +157,24 @@ void VP8Delete(VP8Decoder* const dec);
 // Miscellaneous VP8/VP8L bitstream probing functions.

 // Returns true if the next 3 bytes in data contain the VP8 signature.
-WEBP_EXTERN int VP8CheckSignature(const uint8_t* const data, size_t data_size);
+WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);

 // Validates the VP8 data-header and retrieves basic header information viz
 // width and height. Returns 0 in case of formatting error. *width/*height
 // can be passed NULL.
-WEBP_EXTERN int VP8GetInfo(
+WEBP_EXTERN(int) VP8GetInfo(
    const uint8_t* data,
    size_t data_size,    // data available so far
    size_t chunk_size,   // total data size expected in the chunk
    int* const width, int* const height);

 // Returns true if the next byte(s) in data is a VP8L signature.
-WEBP_EXTERN int VP8LCheckSignature(const uint8_t* const data, size_t size);
+WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);

 // Validates the VP8L data-header and retrieves basic header information viz
 // width, height and alpha. Returns 0 in case of formatting error.
 // width/height/has_alpha can be passed NULL.
-WEBP_EXTERN int VP8LGetInfo(
+WEBP_EXTERN(int) VP8LGetInfo(
    const uint8_t* data, size_t data_size,  // data available so far
    int* const width, int* const height, int* const has_alpha);

@@ -182,4 +182,4 @@ WEBP_EXTERN int VP8LGetInfo(
 }    // extern "C"
 #endif

-#endif  /* WEBP_DEC_VP8_DEC_H_ */
+#endif  /* WEBP_WEBP_DECODE_VP8_H_ */
--- a/src/dec/vp8i_dec.h
+++ b/src/dec/vp8i_dec.h
@@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#ifndef WEBP_DEC_VP8I_DEC_H_
-#define WEBP_DEC_VP8I_DEC_H_
+#ifndef WEBP_DEC_VP8I_H_
+#define WEBP_DEC_VP8I_H_

 #include <string.h>     // for memcpy()
-#include "src/dec/common_dec.h"
-#include "src/dec/vp8li_dec.h"
-#include "src/utils/bit_reader_utils.h"
-#include "src/utils/random_utils.h"
-#include "src/utils/thread_utils.h"
-#include "src/dsp/dsp.h"
+#include "./common_dec.h"
+#include "./vp8li_dec.h"
+#include "../utils/bit_reader_utils.h"
+#include "../utils/random_utils.h"
+#include "../utils/thread_utils.h"
+#include "../dsp/dsp.h"

 #ifdef __cplusplus
 extern "C" {
@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 0
 #define DEC_MIN_VERSION 6
-#define DEC_REV_VERSION 1
+#define DEC_REV_VERSION 0

 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
@@ -57,6 +57,7 @@ extern "C" {
 //  '|' = left sample,   '-' = top sample,    '+' = top-left sample
 //  't' = extra top-right sample for 4x4 modes
 #define YUV_SIZE (BPS * 17 + BPS * 9)
+#define Y_SIZE   (BPS * 17)
 #define Y_OFF    (BPS * 1 + 8)
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)
@@ -316,4 +317,4 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
 }    // extern "C"
 #endif

-#endif  /* WEBP_DEC_VP8I_DEC_H_ */
+#endif  /* WEBP_DEC_VP8I_H_ */
--- a/src/dec/vp8l_dec.c
+++ b/src/dec/vp8l_dec.c
@@ -14,22 +14,22 @@

 #include <stdlib.h>

-#include "src/dec/alphai_dec.h"
-#include "src/dec/vp8li_dec.h"
-#include "src/dsp/dsp.h"
-#include "src/dsp/lossless.h"
-#include "src/dsp/lossless_common.h"
-#include "src/dsp/yuv.h"
-#include "src/utils/endian_inl_utils.h"
-#include "src/utils/huffman_utils.h"
-#include "src/utils/utils.h"
+#include "./alphai_dec.h"
+#include "./vp8li_dec.h"
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../dsp/yuv.h"
+#include "../utils/endian_inl_utils.h"
+#include "../utils/huffman_utils.h"
+#include "../utils/utils.h"

 #define NUM_ARGB_CACHE_ROWS          16

 static const int kCodeLengthLiterals = 16;
 static const int kCodeLengthRepeatCode = 16;
-static const uint8_t kCodeLengthExtraBits[3] = { 2, 3, 7 };
-static const uint8_t kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
+static const int kCodeLengthExtraBits[3] = { 2, 3, 7 };
+static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };

 // -----------------------------------------------------------------------------
 //  Five Huffman codes are used at each meta code:
@@ -86,7 +86,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
 // All values computed for 8-bit first level lookup with Mark Adler's tool:
 // http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
 #define FIXED_TABLE_SIZE (630 * 3 + 410)
-static const uint16_t kTableSize[12] = {
+static const int kTableSize[12] = {
  FIXED_TABLE_SIZE + 654,
  FIXED_TABLE_SIZE + 656,
  FIXED_TABLE_SIZE + 658,
@@ -253,11 +253,11 @@ static int ReadHuffmanCodeLengths(
  int symbol;
  int max_symbol;
  int prev_code_len = DEFAULT_CODE_LENGTH;
-  HuffmanTables tables;
+  HuffmanCode table[1 << LENGTHS_TABLE_BITS];

-  if (!VP8LHuffmanTablesAllocate(1 << LENGTHS_TABLE_BITS, &tables) ||
-      !VP8LBuildHuffmanTable(&tables, LENGTHS_TABLE_BITS,
-                             code_length_code_lengths, NUM_CODE_LENGTH_CODES)) {
+  if (!VP8LBuildHuffmanTable(table, LENGTHS_TABLE_BITS,
+                             code_length_code_lengths,
+                             NUM_CODE_LENGTH_CODES)) {
    goto End;
  }

@@ -277,7 +277,7 @@ static int ReadHuffmanCodeLengths(
    int code_len;
    if (max_symbol-- == 0) break;
    VP8LFillBitWindow(br);
-    p = &tables.curr_segment->start[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK];
+    p = &table[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK];
    VP8LSetBitPos(br, br->bit_pos_ + p->bits);
    code_len = p->value;
    if (code_len < kCodeLengthLiterals) {
@@ -300,7 +300,6 @@ static int ReadHuffmanCodeLengths(
  ok = 1;

 End:
-  VP8LHuffmanTablesDeallocate(&tables);
  if (!ok) dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
  return ok;
 }
@@ -308,8 +307,7 @@ static int ReadHuffmanCodeLengths(
 // 'code_lengths' is pre-allocated temporary buffer, used for creating Huffman
 // tree.
 static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
-                           int* const code_lengths,
-                           HuffmanTables* const table) {
+                           int* const code_lengths, HuffmanCode* const table) {
  int ok = 0;
  int size = 0;
  VP8LBitReader* const br = &dec->br_;
@@ -364,18 +362,12 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
  VP8LMetadata* const hdr = &dec->hdr_;
  uint32_t* huffman_image = NULL;
  HTreeGroup* htree_groups = NULL;
-  HuffmanTables* huffman_tables = &hdr->huffman_tables_;
+  HuffmanCode* huffman_tables = NULL;
+  HuffmanCode* next = NULL;
  int num_htree_groups = 1;
-  int num_htree_groups_max = 1;
  int max_alphabet_size = 0;
  int* code_lengths = NULL;
  const int table_size = kTableSize[color_cache_bits];
-  int* mapping = NULL;
-  int ok = 0;
-
-  // Check the table has been 0 initialized (through InitMetadata).
-  assert(huffman_tables->root.start == NULL);
-  assert(huffman_tables->curr_segment == NULL);

  if (allow_recursion && VP8LReadBits(br, 1)) {
    // use meta Huffman codes.
@@ -392,36 +384,10 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
      // The huffman data is stored in red and green bytes.
      const int group = (huffman_image[i] >> 8) & 0xffff;
      huffman_image[i] = group;
-      if (group >= num_htree_groups_max) {
-        num_htree_groups_max = group + 1;
+      if (group >= num_htree_groups) {
+        num_htree_groups = group + 1;
      }
    }
-    // Check the validity of num_htree_groups_max. If it seems too big, use a
-    // smaller value for later. This will prevent big memory allocations to end
-    // up with a bad bitstream anyway.
-    // The value of 1000 is totally arbitrary. We know that num_htree_groups_max
-    // is smaller than (1 << 16) and should be smaller than the number of pixels
-    // (though the format allows it to be bigger).
-    if (num_htree_groups_max > 1000 || num_htree_groups_max > xsize * ysize) {
-      // Create a mapping from the used indices to the minimal set of used
-      // values [0, num_htree_groups)
-      mapping = (int*)WebPSafeMalloc(num_htree_groups_max, sizeof(*mapping));
-      if (mapping == NULL) {
-        dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-        goto Error;
-      }
-      // -1 means a value is unmapped, and therefore unused in the Huffman
-      // image.
-      memset(mapping, 0xff, num_htree_groups_max * sizeof(*mapping));
-      for (num_htree_groups = 0, i = 0; i < huffman_pixs; ++i) {
-        // Get the current mapping for the group and remap the Huffman image.
-        int* const mapped_group = &mapping[huffman_image[i]];
-        if (*mapped_group == -1) *mapped_group = num_htree_groups++;
-        huffman_image[i] = *mapped_group;
-      }
-    } else {
-      num_htree_groups = num_htree_groups_max;
-    }
  }

  if (br->eos_) goto Error;
@@ -437,105 +403,88 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
    }
  }

+  huffman_tables = (HuffmanCode*)WebPSafeMalloc(num_htree_groups * table_size,
+                                                sizeof(*huffman_tables));
  htree_groups = VP8LHtreeGroupsNew(num_htree_groups);
  code_lengths = (int*)WebPSafeCalloc((uint64_t)max_alphabet_size,
                                      sizeof(*code_lengths));

-  if (htree_groups == NULL || code_lengths == NULL ||
-      !VP8LHuffmanTablesAllocate(num_htree_groups * table_size,
-                                 huffman_tables)) {
+  if (htree_groups == NULL || code_lengths == NULL || huffman_tables == NULL) {
    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
    goto Error;
  }

-  for (i = 0; i < num_htree_groups_max; ++i) {
-    // If the index "i" is unused in the Huffman image, just make sure the
-    // coefficients are valid but do not store them.
-    if (mapping != NULL && mapping[i] == -1) {
-      for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
-        int alphabet_size = kAlphabetSize[j];
-        if (j == 0 && color_cache_bits > 0) {
-          alphabet_size += (1 << color_cache_bits);
-        }
-        // Passing in NULL so that nothing gets filled.
-        if (!ReadHuffmanCode(alphabet_size, dec, code_lengths, NULL)) {
-          goto Error;
-        }
+  next = huffman_tables;
+  for (i = 0; i < num_htree_groups; ++i) {
+    HTreeGroup* const htree_group = &htree_groups[i];
+    HuffmanCode** const htrees = htree_group->htrees;
+    int size;
+    int total_size = 0;
+    int is_trivial_literal = 1;
+    int max_bits = 0;
+    for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+      int alphabet_size = kAlphabetSize[j];
+      htrees[j] = next;
+      if (j == 0 && color_cache_bits > 0) {
+        alphabet_size += 1 << color_cache_bits;
      }
-    } else {
-      HTreeGroup* const htree_group =
-          &htree_groups[(mapping == NULL) ? i : mapping[i]];
-      HuffmanCode** const htrees = htree_group->htrees;
-      int size;
-      int total_size = 0;
-      int is_trivial_literal = 1;
-      int max_bits = 0;
-      for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
-        int alphabet_size = kAlphabetSize[j];
-        if (j == 0 && color_cache_bits > 0) {
-          alphabet_size += (1 << color_cache_bits);
-        }
-        size =
-            ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_tables);
-        htrees[j] = huffman_tables->curr_segment->curr_table;
-        if (size == 0) {
-          goto Error;
-        }
-        if (is_trivial_literal && kLiteralMap[j] == 1) {
-          is_trivial_literal = (htrees[j]->bits == 0);
-        }
-        total_size += htrees[j]->bits;
-        huffman_tables->curr_segment->curr_table += size;
-        if (j <= ALPHA) {
-          int local_max_bits = code_lengths[0];
-          int k;
-          for (k = 1; k < alphabet_size; ++k) {
-            if (code_lengths[k] > local_max_bits) {
-              local_max_bits = code_lengths[k];
-            }
+      size = ReadHuffmanCode(alphabet_size, dec, code_lengths, next);
+      if (size == 0) {
+        goto Error;
+      }
+      if (is_trivial_literal && kLiteralMap[j] == 1) {
+        is_trivial_literal = (next->bits == 0);
+      }
+      total_size += next->bits;
+      next += size;
+      if (j <= ALPHA) {
+        int local_max_bits = code_lengths[0];
+        int k;
+        for (k = 1; k < alphabet_size; ++k) {
+          if (code_lengths[k] > local_max_bits) {
+            local_max_bits = code_lengths[k];
          }
-          max_bits += local_max_bits;
        }
+        max_bits += local_max_bits;
      }
-      htree_group->is_trivial_literal = is_trivial_literal;
-      htree_group->is_trivial_code = 0;
-      if (is_trivial_literal) {
-        const int red = htrees[RED][0].value;
-        const int blue = htrees[BLUE][0].value;
-        const int alpha = htrees[ALPHA][0].value;
-        htree_group->literal_arb = ((uint32_t)alpha << 24) | (red << 16) | blue;
-        if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) {
-          htree_group->is_trivial_code = 1;
-          htree_group->literal_arb |= htrees[GREEN][0].value << 8;
-        }
-      }
-      htree_group->use_packed_table =
-          !htree_group->is_trivial_code && (max_bits < HUFFMAN_PACKED_BITS);
-      if (htree_group->use_packed_table) BuildPackedTable(htree_group);
    }
+    htree_group->is_trivial_literal = is_trivial_literal;
+    htree_group->is_trivial_code = 0;
+    if (is_trivial_literal) {
+      const int red = htrees[RED][0].value;
+      const int blue = htrees[BLUE][0].value;
+      const int alpha = htrees[ALPHA][0].value;
+      htree_group->literal_arb =
+          ((uint32_t)alpha << 24) | (red << 16) | blue;
+      if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) {
+        htree_group->is_trivial_code = 1;
+        htree_group->literal_arb |= htrees[GREEN][0].value << 8;
+      }
+    }
+    htree_group->use_packed_table = !htree_group->is_trivial_code &&
+                                    (max_bits < HUFFMAN_PACKED_BITS);
+    if (htree_group->use_packed_table) BuildPackedTable(htree_group);
  }
-  ok = 1;
+  WebPSafeFree(code_lengths);

-  // All OK. Finalize pointers.
+  // All OK. Finalize pointers and return.
  hdr->huffman_image_ = huffman_image;
  hdr->num_htree_groups_ = num_htree_groups;
  hdr->htree_groups_ = htree_groups;
+  hdr->huffman_tables_ = huffman_tables;
+  return 1;

 Error:
  WebPSafeFree(code_lengths);
-  WebPSafeFree(mapping);
-  if (!ok) {
-    WebPSafeFree(huffman_image);
-    VP8LHuffmanTablesDeallocate(huffman_tables);
-    VP8LHtreeGroupsFree(htree_groups);
-  }
-  return ok;
+  WebPSafeFree(huffman_image);
+  WebPSafeFree(huffman_tables);
+  VP8LHtreeGroupsFree(htree_groups);
+  return 0;
 }

 //------------------------------------------------------------------------------
 // Scaling.

-#if !defined(WEBP_REDUCE_SIZE)
 static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
  const int num_channels = 4;
  const int in_width = io->mb_w;
@@ -567,13 +516,10 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
                   out_width, out_height, 0, num_channels, work);
  return 1;
 }
-#endif   // WEBP_REDUCE_SIZE

 //------------------------------------------------------------------------------
 // Export to ARGB

-#if !defined(WEBP_REDUCE_SIZE)
-
 // We have special "export" function since we need to convert from BGRA
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                  int rgba_stride, uint8_t* const rgba) {
@@ -615,8 +561,6 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
  return num_lines_out;
 }

-#endif   // WEBP_REDUCE_SIZE
-
 // Emit rows without any scaling.
 static int EmitRows(WEBP_CSP_MODE colorspace,
                    const uint8_t* row_in, int in_stride,
@@ -802,12 +746,9 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
      if (WebPIsRGBMode(output->colorspace)) {  // convert to RGBA
        const WebPRGBABuffer* const buf = &output->u.RGBA;
        uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
-        const int num_rows_out =
-#if !defined(WEBP_REDUCE_SIZE)
-         io->use_scaling ?
+        const int num_rows_out = io->use_scaling ?
            EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
                                 rgba, buf->stride) :
-#endif  // WEBP_REDUCE_SIZE
            EmitRows(output->colorspace, rows_data, in_stride,
                     io->mb_w, io->mb_h, rgba, buf->stride);
        // Update 'last_out_row_'.
@@ -934,11 +875,7 @@ static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) {
 #endif
        break;
      case 2:
-#if !defined(WORDS_BIGENDIAN)
        memcpy(&pattern, src, sizeof(uint16_t));
-#else
-        pattern = ((uint32_t)src[0] << 8) | src[1];
-#endif
 #if defined(__arm__) || defined(_M_ARM)
        pattern |= pattern << 16;
 #elif defined(WEBP_USE_MIPS_DSP_R2)
@@ -1237,20 +1174,9 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
  }

  br->eos_ = VP8LIsEndOfStream(br);
-  // In incremental decoding:
-  // br->eos_ && src < src_last: if 'br' reached the end of the buffer and
-  // 'src_last' has not been reached yet, there is not enough data. 'dec' has to
-  // be reset until there is more data.
-  // !br->eos_ && src < src_last: this cannot happen as either the buffer is
-  // fully read, either enough has been read to reach 'src_last'.
-  // src >= src_last: 'src_last' is reached, all is fine. 'src' can actually go
-  // beyond 'src_last' in case the image is cropped and an LZ77 goes further.
-  // The buffer might have been enough or there is some left. 'br->eos_' does
-  // not matter.
-  assert(!dec->incremental_ || (br->eos_ && src < src_last) || src >= src_last);
-  if (dec->incremental_ && br->eos_ && src < src_last) {
+  if (dec->incremental_ && br->eos_ && src < src_end) {
    RestoreState(dec);
-  } else if ((dec->incremental_ && src >= src_last) || !br->eos_) {
+  } else if (!br->eos_) {
    // Process the remaining rows corresponding to last row-block.
    if (process_func != NULL) {
      process_func(dec, row > last_row ? last_row : row);
@@ -1369,7 +1295,7 @@ static void ClearMetadata(VP8LMetadata* const hdr) {
  assert(hdr != NULL);

  WebPSafeFree(hdr->huffman_image_);
-  VP8LHuffmanTablesDeallocate(&hdr->huffman_tables_);
+  WebPSafeFree(hdr->huffman_tables_);
  VP8LHtreeGroupsFree(hdr->htree_groups_);
  VP8LColorCacheClear(&hdr->color_cache_);
  VP8LColorCacheClear(&hdr->saved_color_cache_);
@@ -1685,7 +1611,7 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
  // Sanity checks.
  if (dec == NULL) return 0;

-  assert(dec->hdr_.huffman_tables_.root.start != NULL);
+  assert(dec->hdr_.huffman_tables_ != NULL);
  assert(dec->hdr_.htree_groups_ != NULL);
  assert(dec->hdr_.num_htree_groups_ > 0);

@@ -1706,19 +1632,12 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {

    if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;

-#if !defined(WEBP_REDUCE_SIZE)
    if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;

    if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
      // need the alpha-multiply functions for premultiplied output or rescaling
      WebPInitAlphaProcessing();
    }
-#else
-    if (io->use_scaling) {
-      dec->status_ = VP8_STATUS_INVALID_PARAM;
-      goto Err;
-    }
-#endif
    if (!WebPIsRGBMode(dec->output_->colorspace)) {
      WebPInitConvertARGBToYUV();
      if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();
--- a/src/dec/vp8li_dec.h
+++ b/src/dec/vp8li_dec.h
@@ -12,14 +12,14 @@
 // Author: Skal (pascal.massimino@gmail.com)
 //         Vikas Arora(vikaas.arora@gmail.com)

-#ifndef WEBP_DEC_VP8LI_DEC_H_
-#define WEBP_DEC_VP8LI_DEC_H_
+#ifndef WEBP_DEC_VP8LI_H_
+#define WEBP_DEC_VP8LI_H_

 #include <string.h>     // for memcpy()
-#include "src/dec/webpi_dec.h"
-#include "src/utils/bit_reader_utils.h"
-#include "src/utils/color_cache_utils.h"
-#include "src/utils/huffman_utils.h"
+#include "./webpi_dec.h"
+#include "../utils/bit_reader_utils.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/huffman_utils.h"

 #ifdef __cplusplus
 extern "C" {
@@ -51,7 +51,7 @@ typedef struct {
  uint32_t       *huffman_image_;
  int             num_htree_groups_;
  HTreeGroup     *htree_groups_;
-  HuffmanTables   huffman_tables_;
+  HuffmanCode    *huffman_tables_;
 } VP8LMetadata;

 typedef struct VP8LDecoder VP8LDecoder;
@@ -132,4 +132,4 @@ void VP8LDelete(VP8LDecoder* const dec);
 }    // extern "C"
 #endif

-#endif  /* WEBP_DEC_VP8LI_DEC_H_ */
+#endif  /* WEBP_DEC_VP8LI_H_ */
--- a/src/dec/webp_dec.c
+++ b/src/dec/webp_dec.c
@@ -13,11 +13,11 @@

 #include <stdlib.h>

-#include "src/dec/vp8i_dec.h"
-#include "src/dec/vp8li_dec.h"
-#include "src/dec/webpi_dec.h"
-#include "src/utils/utils.h"
-#include "src/webp/mux_types.h"  // ALPHA_FLAG
+#include "./vp8i_dec.h"
+#include "./vp8li_dec.h"
+#include "./webpi_dec.h"
+#include "../utils/utils.h"
+#include "../webp/mux_types.h"  // ALPHA_FLAG

 //------------------------------------------------------------------------------
 // RIFF layout is:
@@ -421,9 +421,7 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
                                NULL, NULL, NULL, &has_animation,
                                NULL, headers);
  if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
-    // The WebPDemux API + libwebp can be used to decode individual
-    // uncomposited frames or the WebPAnimDecoder can be used to fully
-    // reconstruct them (see webp/demux.h).
+    // TODO(jzern): full support of animation frames will require API additions.
    if (has_animation) {
      status = VP8_STATUS_UNSUPPORTED_FEATURE;
    }
--- a/src/dec/webpi_dec.h
+++ b/src/dec/webpi_dec.h
@@ -11,15 +11,15 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)

-#ifndef WEBP_DEC_WEBPI_DEC_H_
-#define WEBP_DEC_WEBPI_DEC_H_
+#ifndef WEBP_DEC_WEBPI_H_
+#define WEBP_DEC_WEBPI_H_

 #ifdef __cplusplus
 extern "C" {
 #endif

-#include "src/utils/rescaler_utils.h"
-#include "src/dec/vp8_dec.h"
+#include "../utils/rescaler_utils.h"
+#include "./vp8_dec.h"

 //------------------------------------------------------------------------------
 // WebPDecParams: Decoding output parameters. Transient internal object.
@@ -130,4 +130,4 @@ int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
 }    // extern "C"
 #endif

-#endif  /* WEBP_DEC_WEBPI_DEC_H_ */
+#endif  /* WEBP_DEC_WEBPI_H_ */
--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@@ -1,4 +1,3 @@
-AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
 lib_LTLIBRARIES = libwebpdemux.la

 libwebpdemux_la_SOURCES =
@@ -10,6 +9,6 @@ libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
 libwebpdemuxinclude_HEADERS += ../webp/types.h

 libwebpdemux_la_LIBADD = ../libwebp.la
-libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:3:0
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:2:0
 libwebpdemuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/anim_decode.c
+++ b/src/demux/anim_decode.c
@@ -11,15 +11,15 @@
 //

 #ifdef HAVE_CONFIG_H
-#include "src/webp/config.h"
+#include "../webp/config.h"
 #endif

 #include <assert.h>
 #include <string.h>

-#include "src/utils/utils.h"
-#include "src/webp/decode.h"
-#include "src/webp/demux.h"
+#include "../utils/utils.h"
+#include "../webp/decode.h"
+#include "../webp/demux.h"

 #define NUM_CHANNELS 4

--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@@ -11,21 +11,21 @@
 //

 #ifdef HAVE_CONFIG_H
-#include "src/webp/config.h"
+#include "../webp/config.h"
 #endif

 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>

-#include "src/utils/utils.h"
-#include "src/webp/decode.h"     // WebPGetFeatures
-#include "src/webp/demux.h"
-#include "src/webp/format_constants.h"
+#include "../utils/utils.h"
+#include "../webp/decode.h"     // WebPGetFeatures
+#include "../webp/demux.h"
+#include "../webp/format_constants.h"

 #define DMUX_MAJ_VERSION 0
 #define DMUX_MIN_VERSION 3
-#define DMUX_REV_VERSION 3
+#define DMUX_REV_VERSION 2

 typedef struct {
  size_t start_;        // start location of the data
@@ -205,14 +205,12 @@ static void SetFrameInfo(size_t start_offset, size_t size,
  frame->complete_ = complete;
 }

-// Store image bearing chunks to 'frame'. 'min_size' is an optional size
-// requirement, it may be zero.
+// Store image bearing chunks to 'frame'.
 static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
                              MemBuffer* const mem, Frame* const frame) {
  int alpha_chunks = 0;
  int image_chunks = 0;
-  int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE ||
-              MemDataSize(mem) < min_size);
+  int done = (MemDataSize(mem) < min_size);
  ParseStatus status = PARSE_OK;

  if (done) return PARSE_NEED_MORE_DATA;
@@ -403,9 +401,9 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
  frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
  if (frame == NULL) return PARSE_ERROR;

-  // For the single image case we allow parsing of a partial frame, so no
-  // minimum size is imposed here.
-  status = StoreFrame(1, 0, &dmux->mem_, frame);
+  // For the single image case we allow parsing of a partial frame, but we need
+  // at least CHUNK_HEADER_SIZE for parsing.
+  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
  if (status != PARSE_ERROR) {
    const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
    // Clear any alpha when the alpha flag is missing.
--- a/src/demux/libwebpdemux.rc
+++ b/src/demux/libwebpdemux.rc
@@ -6,8 +6,8 @@
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 0,3,0,3
- PRODUCTVERSION 0,3,0,3
+ FILEVERSION 0,3,0,2
+ PRODUCTVERSION 0,3,0,2
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@@ -24,12 +24,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "Google, Inc."
            VALUE "FileDescription", "libwebpdemux DLL"
-            VALUE "FileVersion", "0.3.3"
+            VALUE "FileVersion", "0.3.2"
            VALUE "InternalName", "libwebpdemux.dll"
            VALUE "LegalCopyright", "Copyright (C) 2017"
            VALUE "OriginalFilename", "libwebpdemux.dll"
            VALUE "ProductName", "WebP Image Demuxer"
-            VALUE "ProductVersion", "0.3.3"
+            VALUE "ProductVersion", "0.3.2"
        END
    END
    BLOCK "VarFileInfo"
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -1,15 +1,9 @@
-AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
-noinst_LTLIBRARIES =
-noinst_LTLIBRARIES += libwebpdsp.la
-noinst_LTLIBRARIES += libwebpdsp_avx2.la
-noinst_LTLIBRARIES += libwebpdsp_sse2.la
-noinst_LTLIBRARIES += libwebpdspdecode_sse2.la
-noinst_LTLIBRARIES += libwebpdsp_sse41.la
-noinst_LTLIBRARIES += libwebpdspdecode_sse41.la
-noinst_LTLIBRARIES += libwebpdsp_neon.la
-noinst_LTLIBRARIES += libwebpdspdecode_neon.la
-noinst_LTLIBRARIES += libwebpdsp_msa.la
-noinst_LTLIBRARIES += libwebpdspdecode_msa.la
+noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
+noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
+noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la
+noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la
+noinst_LTLIBRARIES += libwebpdsp_msa.la libwebpdspdecode_msa.la
+noinst_LTLIBRARIES += libwebpdspdecode_wasm.la

 if BUILD_LIBWEBPDECODER
  noinst_LTLIBRARIES += libwebpdspdecode.la
@@ -46,6 +40,8 @@ COMMON_SOURCES += yuv_mips32.c
 COMMON_SOURCES += yuv_mips_dsp_r2.c

 ENC_SOURCES =
+ENC_SOURCES += argb.c
+ENC_SOURCES += argb_mips_dsp_r2.c
 ENC_SOURCES += cost.c
 ENC_SOURCES += cost_mips32.c
 ENC_SOURCES += cost_mips_dsp_r2.c
@@ -101,7 +97,12 @@ libwebpdspdecode_msa_la_SOURCES += upsampling_msa.c
 libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_msa_la_CPPFLAGS)
 libwebpdspdecode_msa_la_CFLAGS = $(libwebpdsp_msa_la_CFLAGS)

+# WASM is not fully integrated into configure; the addition here keeps source
+# extraction by cmake simple.
+libwebpdspdecode_wasm_la_SOURCES = dec_wasm.c
+
 libwebpdsp_sse2_la_SOURCES =
+libwebpdsp_sse2_la_SOURCES += argb_sse2.c
 libwebpdsp_sse2_la_SOURCES += cost_sse2.c
 libwebpdsp_sse2_la_SOURCES += enc_sse2.c
 libwebpdsp_sse2_la_SOURCES += lossless_enc_sse2.c
@@ -142,8 +143,7 @@ libwebpdsp_la_CPPFLAGS += $(AM_CPPFLAGS)
 libwebpdsp_la_CPPFLAGS += $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
 libwebpdsp_la_LDFLAGS = -lm
 libwebpdsp_la_LIBADD =
-libwebpdsp_la_LIBADD += libwebpdsp_avx2.la
-libwebpdsp_la_LIBADD += libwebpdsp_sse2.la
+libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la
 libwebpdsp_la_LIBADD += libwebpdsp_sse41.la
 libwebpdsp_la_LIBADD += libwebpdsp_neon.la
 libwebpdsp_la_LIBADD += libwebpdsp_msa.la
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@@ -12,13 +12,10 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <assert.h>
-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 // Tables can be faster on some platform but incur some extra binary size (~2k).
-#if !defined(USE_TABLES_FOR_ALPHA_MULT)
-#define USE_TABLES_FOR_ALPHA_MULT 0   // ALTERNATE_CODE
-#endif
-
+// #define USE_TABLES_FOR_ALPHA_MULT

 // -----------------------------------------------------------------------------

@@ -32,7 +29,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) {
  return v;
 }

-#if (USE_TABLES_FOR_ALPHA_MULT == 1)
+#ifdef USE_TABLES_FOR_ALPHA_MULT

 static const uint32_t kMultTables[2][256] = {
  {    // (255u << MFIX) / alpha
@@ -135,9 +132,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
  return inverse ? (255u << MFIX) / a : a * KINV_255;
 }

-#endif  // USE_TABLES_FOR_ALPHA_MULT
+#endif    // USE_TABLES_FOR_ALPHA_MULT

-void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t argb = ptr[x];
@@ -157,8 +154,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
  }
 }

-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
-                   int width, int inverse) {
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+                  int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t a = alpha[x];
@@ -220,9 +217,8 @@ void WebPMultRows(uint8_t* ptr, int stride,
 #define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
 #endif

-#if !WEBP_NEON_OMIT_C_CODE
-static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
-                                 int w, int h, int stride) {
+static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
+                               int w, int h, int stride) {
  while (h-- > 0) {
    uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
    const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
@@ -239,7 +235,6 @@ static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
    rgba += stride;
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE
 #undef MULTIPLIER
 #undef PREMULTIPLY

@@ -259,9 +254,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
  return (x * m) >> 16;
 }

-static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
-                                                 int w, int h, int stride,
-                                                 int rg_byte_pos /* 0 or 1 */) {
+static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
+                                               int w, int h, int stride,
+                                               int rg_byte_pos /* 0 or 1 */) {
  while (h-- > 0) {
    int i;
    for (i = 0; i < w; ++i) {
@@ -280,16 +275,15 @@ static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
 }
 #undef MULTIPLIER

-static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
-                                     int w, int h, int stride) {
-#if (WEBP_SWAP_16BIT_CSP == 1)
-  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
+static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
+                                   int w, int h, int stride) {
+#ifdef WEBP_SWAP_16BIT_CSP
+  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
 #else
-  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
+  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
 #endif
 }

-#if !WEBP_NEON_OMIT_C_CODE
 static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
                           int width, int height,
                           uint8_t* dst, int dst_stride) {
@@ -344,46 +338,6 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
  int i;
  for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE
-
-//------------------------------------------------------------------------------
-
-static int HasAlpha8b_C(const uint8_t* src, int length) {
-  while (length-- > 0) if (*src++ != 0xff) return 1;
-  return 0;
-}
-
-static int HasAlpha32b_C(const uint8_t* src, int length) {
-  int x;
-  for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
-  return 0;
-}
-
-//------------------------------------------------------------------------------
-// Simple channel manipulations.
-
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
-  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
-#ifdef WORDS_BIGENDIAN
-static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                       const uint8_t* b, int len, uint32_t* out) {
-  int i;
-  for (i = 0; i < len; ++i) {
-    out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
-  }
-}
-#endif
-
-static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                      int len, int step, uint32_t* out) {
-  int i, offset = 0;
-  for (i = 0; i < len; ++i) {
-    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
-    offset += step;
-  }
-}

 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
@@ -391,15 +345,6 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
-#ifdef WORDS_BIGENDIAN
-void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int, uint32_t*);
-#endif
-void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out);
-
-int (*WebPHasAlpha8b)(const uint8_t* src, int length);
-int (*WebPHasAlpha32b)(const uint8_t* src, int length);

 //------------------------------------------------------------------------------
 // Init function
@@ -415,24 +360,15 @@ static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;

-  WebPMultARGBRow = WebPMultARGBRow_C;
-  WebPMultRow = WebPMultRow_C;
-  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
+  WebPMultARGBRow = WebPMultARGBRowC;
+  WebPMultRow = WebPMultRowC;
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;

-#ifdef WORDS_BIGENDIAN
-  WebPPackARGB = PackARGB_C;
-#endif
-  WebPPackRGB = PackRGB_C;
-#if !WEBP_NEON_OMIT_C_CODE
-  WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
  WebPDispatchAlpha = DispatchAlpha_C;
  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
  WebPExtractAlpha = ExtractAlpha_C;
  WebPExtractGreen = ExtractGreen_C;
-#endif
-
-  WebPHasAlpha8b = HasAlpha8b_C;
-  WebPHasAlpha32b = HasAlpha32b_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@@ -446,34 +382,16 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
 #endif
    }
 #endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      WebPInitAlphaProcessingNEON();
+    }
+#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      WebPInitAlphaProcessingMIPSdspR2();
    }
 #endif
  }
-
-#if defined(WEBP_USE_NEON)
-  if (WEBP_NEON_OMIT_C_CODE ||
-      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
-    WebPInitAlphaProcessingNEON();
-  }
-#endif
-
-  assert(WebPMultARGBRow != NULL);
-  assert(WebPMultRow != NULL);
-  assert(WebPApplyAlphaMultiply != NULL);
-  assert(WebPApplyAlphaMultiply4444 != NULL);
-  assert(WebPDispatchAlpha != NULL);
-  assert(WebPDispatchAlphaToGreen != NULL);
-  assert(WebPExtractAlpha != NULL);
-  assert(WebPExtractGreen != NULL);
-#ifdef WORDS_BIGENDIAN
-  assert(WebPPackARGB != NULL);
-#endif
-  assert(WebPPackRGB != NULL);
-  assert(WebPHasAlpha8b != NULL);
-  assert(WebPHasAlpha32b != NULL);
-
  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/alpha_processing_mips_dsp_r2.c
+++ b/src/dsp/alpha_processing_mips_dsp_r2.c
@@ -12,13 +12,13 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut  (djordje.pesut@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
-                                   int width, int height,
-                                   uint8_t* dst, int dst_stride) {
+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+                         int width, int height,
+                         uint8_t* dst, int dst_stride) {
  uint32_t alpha_mask = 0xffffffff;
  int i, j, temp0;

@@ -79,8 +79,7 @@ static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
  return (alpha_mask != 0xff);
 }

-static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
-                                  int inverse) {
+static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x;
  const uint32_t c_00ffffff = 0x00ffffffu;
  const uint32_t c_ff000000 = 0xff000000u;
@@ -125,100 +124,14 @@ static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
  }
 }

-#ifdef WORDS_BIGENDIAN
-static void PackARGB_MIPSdspR2(const uint8_t* a, const uint8_t* r,
-                               const uint8_t* g, const uint8_t* b, int len,
-                               uint32_t* out) {
-  int temp0, temp1, temp2, temp3, offset;
-  const int rest = len & 1;
-  const uint32_t* const loop_end = out + len - rest;
-  const int step = 4;
-  __asm__ volatile (
-    "xor          %[offset],   %[offset], %[offset]    \n\t"
-    "beq          %[loop_end], %[out],    0f           \n\t"
-  "2:                                                  \n\t"
-    "lbux         %[temp0],    %[offset](%[a])         \n\t"
-    "lbux         %[temp1],    %[offset](%[r])         \n\t"
-    "lbux         %[temp2],    %[offset](%[g])         \n\t"
-    "lbux         %[temp3],    %[offset](%[b])         \n\t"
-    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
-    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
-    "addiu        %[out],      %[out],    4            \n\t"
-    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
-    "sw           %[temp0],    -4(%[out])              \n\t"
-    "addu         %[offset],   %[offset], %[step]      \n\t"
-    "bne          %[loop_end], %[out],    2b           \n\t"
-  "0:                                                  \n\t"
-    "beq          %[rest],     $zero,     1f           \n\t"
-    "lbux         %[temp0],    %[offset](%[a])         \n\t"
-    "lbux         %[temp1],    %[offset](%[r])         \n\t"
-    "lbux         %[temp2],    %[offset](%[g])         \n\t"
-    "lbux         %[temp3],    %[offset](%[b])         \n\t"
-    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
-    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
-    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
-    "sw           %[temp0],    0(%[out])               \n\t"
-  "1:                                                  \n\t"
-    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
-    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
-      [loop_end]"r"(loop_end), [rest]"r"(rest)
-    : "memory"
-  );
-}
-#endif  // WORDS_BIGENDIAN
-
-static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
-                              const uint8_t* b, int len, int step,
-                              uint32_t* out) {
-  int temp0, temp1, temp2, offset;
-  const int rest = len & 1;
-  const int a = 0xff;
-  const uint32_t* const loop_end = out + len - rest;
-  __asm__ volatile (
-    "xor          %[offset],   %[offset], %[offset]    \n\t"
-    "beq          %[loop_end], %[out],    0f           \n\t"
-  "2:                                                  \n\t"
-    "lbux         %[temp0],    %[offset](%[r])         \n\t"
-    "lbux         %[temp1],    %[offset](%[g])         \n\t"
-    "lbux         %[temp2],    %[offset](%[b])         \n\t"
-    "ins          %[temp0],    %[a],      16,     16   \n\t"
-    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
-    "addiu        %[out],      %[out],    4            \n\t"
-    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
-    "sw           %[temp0],    -4(%[out])              \n\t"
-    "addu         %[offset],   %[offset], %[step]      \n\t"
-    "bne          %[loop_end], %[out],    2b           \n\t"
-  "0:                                                  \n\t"
-    "beq          %[rest],     $zero,     1f           \n\t"
-    "lbux         %[temp0],    %[offset](%[r])         \n\t"
-    "lbux         %[temp1],    %[offset](%[g])         \n\t"
-    "lbux         %[temp2],    %[offset](%[b])         \n\t"
-    "ins          %[temp0],    %[a],      16,     16   \n\t"
-    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
-    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
-    "sw           %[temp0],    0(%[out])               \n\t"
-  "1:                                                  \n\t"
-    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [offset]"=&r"(offset), [out]"+&r"(out)
-    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
-      [loop_end]"r"(loop_end), [rest]"r"(rest)
-    : "memory"
-  );
-}
-
 //------------------------------------------------------------------------------
 // Entry point

 extern void WebPInitAlphaProcessingMIPSdspR2(void);

 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
-  WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
-  WebPMultARGBRow = MultARGBRow_MIPSdspR2;
-#ifdef WORDS_BIGENDIAN
-  WebPPackARGB = PackARGB_MIPSdspR2;
-#endif
-  WebPPackRGB = PackRGB_MIPSdspR2;
+  WebPDispatchAlpha = DispatchAlpha;
+  WebPMultARGBRow = MultARGBRow;
 }

 #else  // !WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/alpha_processing_neon.c
+++ b/src/dsp/alpha_processing_neon.c
@@ -11,11 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_NEON)

-#include "src/dsp/neon.h"
+#include "./neon.h"

 //------------------------------------------------------------------------------

@@ -83,7 +83,7 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
 static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
                              int width, int height,
                              uint8_t* dst, int dst_stride) {
-  uint32_t alpha_mask = 0xffu;
+  uint32_t alpha_mask = 0xffffffffu;
  uint8x8_t mask8 = vdup_n_u8(0xff);
  uint32_t tmp[2];
  int i, j;
@@ -107,7 +107,6 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
    dst += dst_stride;
  }
  vst1_u8((uint8_t*)tmp, mask8);
-  alpha_mask *= 0x01010101;
  alpha_mask &= tmp[0];
  alpha_mask &= tmp[1];
  return (alpha_mask != 0xffffffffu);
@@ -135,7 +134,7 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
 static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
                             int width, int height,
                             uint8_t* alpha, int alpha_stride) {
-  uint32_t alpha_mask = 0xffu;
+  uint32_t alpha_mask = 0xffffffffu;
  uint8x8_t mask8 = vdup_n_u8(0xff);
  uint32_t tmp[2];
  int i, j;
@@ -157,7 +156,6 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
    alpha += alpha_stride;
  }
  vst1_u8((uint8_t*)tmp, mask8);
-  alpha_mask *= 0x01010101;
  alpha_mask &= tmp[0];
  alpha_mask &= tmp[1];
  return (alpha_mask == 0xffffffffu);
--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>

 //------------------------------------------------------------------------------

-static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
-                              int width, int height,
-                              uint8_t* dst, int dst_stride) {
+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+                         int width, int height,
+                         uint8_t* dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
@@ -72,9 +72,9 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
  return (alpha_and != 0xff);
 }

-static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
-                                      int width, int height,
-                                      uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
+                                 int width, int height,
+                                 uint32_t* dst, int dst_stride) {
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const int limit = width & ~15;
@@ -98,9 +98,9 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
  }
 }

-static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
-                             int width, int height,
-                             uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
@@ -210,61 +210,6 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
 #undef MULTIPLIER
 #undef PREMULTIPLY

-//------------------------------------------------------------------------------
-// Alpha detection
-
-static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
-  const __m128i all_0xff = _mm_set1_epi8(0xff);
-  int i = 0;
-  for (; i + 16 <= length; i += 16) {
-    const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
-    const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
-    const int mask = _mm_movemask_epi8(bits);
-    if (mask != 0xffff) return 1;
-  }
-  for (; i < length; ++i) if (src[i] != 0xff) return 1;
-  return 0;
-}
-
-static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
-  const __m128i alpha_mask = _mm_set1_epi32(0xff);
-  const __m128i all_0xff = _mm_set1_epi8(0xff);
-  int i = 0;
-  // We don't know if we can access the last 3 bytes after the last alpha
-  // value 'src[4 * length - 4]' (because we don't know if alpha is the first
-  // or the last byte of the quadruplet). Hence the '-3' protection below.
-  length = length * 4 - 3;   // size in bytes
-  for (; i + 64 <= length; i += 64) {
-    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
-    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
-    const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
-    const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
-    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
-    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
-    const __m128i b2 = _mm_and_si128(a2, alpha_mask);
-    const __m128i b3 = _mm_and_si128(a3, alpha_mask);
-    const __m128i c0 = _mm_packs_epi32(b0, b1);
-    const __m128i c1 = _mm_packs_epi32(b2, b3);
-    const __m128i d  = _mm_packus_epi16(c0, c1);
-    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
-    const int mask = _mm_movemask_epi8(bits);
-    if (mask != 0xffff) return 1;
-  }
-  for (; i + 32 <= length; i += 32) {
-    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
-    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
-    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
-    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
-    const __m128i c  = _mm_packs_epi32(b0, b1);
-    const __m128i d  = _mm_packus_epi16(c, c);
-    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
-    const int mask = _mm_movemask_epi8(bits);
-    if (mask != 0xffff) return 1;
-  }
-  for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
-  return 0;
-}
-
 // -----------------------------------------------------------------------------
 // Apply alpha value to rows

@@ -293,7 +238,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
    }
  }
  width -= x;
-  if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
+  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
 }

 static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
@@ -316,7 +261,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
    }
  }
  width -= x;
-  if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
+  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
 }

 //------------------------------------------------------------------------------
@@ -328,12 +273,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
  WebPMultARGBRow = MultARGBRow_SSE2;
  WebPMultRow = MultRow_SSE2;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
-  WebPDispatchAlpha = DispatchAlpha_SSE2;
-  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
-  WebPExtractAlpha = ExtractAlpha_SSE2;
-
-  WebPHasAlpha8b = HasAlpha8b_SSE2;
-  WebPHasAlpha32b = HasAlpha32b_SSE2;
+  WebPDispatchAlpha = DispatchAlpha;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
+  WebPExtractAlpha = ExtractAlpha;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/alpha_processing_sse41.c
+++ b/src/dsp/alpha_processing_sse41.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_SSE41)

@@ -19,9 +19,9 @@

 //------------------------------------------------------------------------------

-static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
-                              int width, int height,
-                              uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
@@ -82,7 +82,7 @@ static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
 extern void WebPInitAlphaProcessingSSE41(void);

 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
-  WebPExtractAlpha = ExtractAlpha_SSE41;
+  WebPExtractAlpha = ExtractAlpha;
 }

 #else  // !WEBP_USE_SSE41
--- a/src/dsp/argb.c
+++ b/src/dsp/argb.c
@@ -0,0 +1,68 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions.
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (b | (g << 8) | (r << 16) | ((uint32_t)a << 24));  // a:r:g:b, msb first
+}
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  int px;  // pixel index; each channel pointer is read every 4 bytes
+  for (px = 0; px < len; ++px, ++out) {
+    *out = MakeARGB32(a[4 * px], r[4 * px], g[4 * px], b[4 * px]);
+  }
+}
+
+static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out) {
+  int px;
+  for (px = 0; px < len; ++px) {
+    const int pos = px * step;  // byte offset of this pixel's samples
+    out[px] = MakeARGB32(0xff, r[pos], g[pos], b[pos]);  // alpha set to 0xff
+  }
+}
+
+void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+                    const uint8_t*, int, uint32_t*);  // (a, r, g, b, len, out)
+void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+                   int, int, uint32_t*);  // (r, g, b, len, step, out)
+
+extern void VP8EncDspARGBInitMIPSdspR2(void);
+extern void VP8EncDspARGBInitSSE2(void);
+
+static volatile VP8CPUInfo argb_last_cpuinfo_used =
+    (VP8CPUInfo)&argb_last_cpuinfo_used;  // starts as its own address
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
+  if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;  // already initialized
+
+  VP8PackARGB = PackARGB;  // plain-C versions are installed first
+  VP8PackRGB = PackRGB;
+
+  // If VP8GetCPUInfo is set, let the per-CPU init functions override pointers.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8EncDspARGBInitSSE2();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8EncDspARGBInitMIPSdspR2();
+    }
+#endif
+  }
+  argb_last_cpuinfo_used = VP8GetCPUInfo;  // remembered for the early return
+}
--- a/src/dsp/argb_mips_dsp_r2.c
+++ b/src/dsp/argb_mips_dsp_r2.c
@@ -0,0 +1,110 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions (mips version).
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  int temp0, temp1, temp2, temp3, offset;
+  const int rest = len & 1;  // odd len: one pixel is packed after label 0
+  const uint32_t* const loop_end = out + len - rest;  // main-loop stop address
+  const int step = 4;  // bytes added to 'offset' after each pixel
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"
+    "beq          %[loop_end], %[out],    0f           \n\t"
+  "2:                                                  \n\t"  // main loop
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
+    "addiu        %[out],      %[out],    4            \n\t"
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
+    "sw           %[temp0],    -4(%[out])              \n\t"
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"  // odd-length tail
+    "beq          %[rest],     $zero,     1f           \n\t"
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"  // done
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"
+  );
+}
+
+static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out) {
+  int temp0, temp1, temp2, offset;
+  const int rest = len & 1;  // odd len: one pixel is packed after label 0
+  const int a = 0xff;  // constant inserted where an alpha sample would go
+  const uint32_t* const loop_end = out + len - rest;  // main-loop stop address
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"
+    "beq          %[loop_end], %[out],    0f           \n\t"
+  "2:                                                  \n\t"  // main loop
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "addiu        %[out],      %[out],    4            \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    -4(%[out])              \n\t"
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"  // odd-length tail
+    "beq          %[rest],     $zero,     1f           \n\t"
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"  // done
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"
+  );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspARGBInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
+  VP8PackRGB = PackRGB;    // both packers get the inline-asm versions
+  VP8PackARGB = PackARGB;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/argb_sse2.c
+++ b/src/dsp/argb_sse2.c
@@ -0,0 +1,53 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions (SSE2 version).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+#include "./lossless.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <string.h>
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  (void)a;  // only referenced by the asserts below
+  if (g == r + 1) {  // RGBA input order. Need to swap R and B.
+    assert(b == r + 2);
+    assert(a == r + 3);
+    VP8LConvertBGRAToRGBA((const uint32_t*)r, len, (uint8_t*)out);
+  } else {  // b, g, r, a are consecutive bytes: plain copy of 'len' pixels
+    assert(g == b + 1);
+    assert(r == b + 2);
+    assert(a == b + 3);
+    memcpy(out, b, (size_t)len * sizeof(*out));  // size computed in size_t
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspARGBInitSSE2(void);
+extern void VP8LDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
+  VP8LDspInitSSE2();  // presumably sets up VP8LConvertBGRAToRGBA - confirm
+  VP8PackARGB = PackARGB;  // VP8PackRGB is left as previously set
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/cost.c
+++ b/src/dsp/cost.c
@@ -9,8 +9,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
-#include "src/enc/cost_enc.h"
+#include "./dsp.h"
+#include "../enc/cost_enc.h"

 //------------------------------------------------------------------------------
 // Boolean-cost cost table
@@ -319,7 +319,7 @@ const uint8_t VP8EncBands[16 + 1] = {
 //------------------------------------------------------------------------------
 // Mode costs

-static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  const int p0 = res->prob[n][ctx0][0];
@@ -354,8 +354,8 @@ static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
  return cost;
 }

-static void SetResidualCoeffs_C(const int16_t* const coeffs,
-                                VP8Residual* const res) {
+static void SetResidualCoeffs(const int16_t* const coeffs,
+                              VP8Residual* const res) {
  int n;
  res->last = -1;
  assert(res->first == 0 || coeffs[0] == 0);
@@ -384,8 +384,8 @@ static volatile VP8CPUInfo cost_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
  if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;

-  VP8GetResidualCost = GetResidualCost_C;
-  VP8SetResidualCoeffs = SetResidualCoeffs_C;
+  VP8GetResidualCost = GetResidualCost;
+  VP8SetResidualCoeffs = SetResidualCoeffs;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
--- a/src/dsp/cost_mips32.c
+++ b/src/dsp/cost_mips32.c
@@ -9,13 +9,13 @@
 //
 // Author: Djordje Pesut (djordje.pesut@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MIPS32)

-#include "src/enc/cost_enc.h"
+#include "../enc/cost_enc.h"

-static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  int temp0, temp1;
  int v_reg, ctx_reg;
  int n = res->first;
@@ -96,8 +96,8 @@ static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
  return cost;
 }

-static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
-                                     VP8Residual* const res) {
+static void SetResidualCoeffs(const int16_t* const coeffs,
+                              VP8Residual* const res) {
  const int16_t* p_coeffs = (int16_t*)coeffs;
  int temp0, temp1, temp2, n, n1;
  assert(res->first == 0 || coeffs[0] == 0);
@@ -143,8 +143,8 @@ static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
 extern void VP8EncDspCostInitMIPS32(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
-  VP8GetResidualCost = GetResidualCost_MIPS32;
-  VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
+  VP8GetResidualCost = GetResidualCost;
+  VP8SetResidualCoeffs = SetResidualCoeffs;
 }

 #else  // !WEBP_USE_MIPS32
--- a/src/dsp/cost_mips_dsp_r2.c
+++ b/src/dsp/cost_mips_dsp_r2.c
@@ -9,13 +9,13 @@
 //
 // Author: Djordje Pesut (djordje.pesut@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-#include "src/enc/cost_enc.h"
+#include "../enc/cost_enc.h"

-static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  int temp0, temp1;
  int v_reg, ctx_reg;
  int n = res->first;
@@ -97,7 +97,7 @@ static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
 extern void VP8EncDspCostInitMIPSdspR2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
-  VP8GetResidualCost = GetResidualCost_MIPSdspR2;
+  VP8GetResidualCost = GetResidualCost;
 }

 #else  // !WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/cost_sse2.c
+++ b/src/dsp/cost_sse2.c
@@ -11,19 +11,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>

-#include "src/enc/cost_enc.h"
-#include "src/enc/vp8i_enc.h"
-#include "src/utils/utils.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------

-static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
-                                   VP8Residual* const res) {
+static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                                  VP8Residual* const res) {
  const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Use SSE2 to compare 16 values with a single instruction.
@@ -42,7 +42,7 @@ static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
  res->coeffs = coeffs;
 }

-static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
+static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
  uint8_t levels[16], ctxs[16];
  uint16_t abs_levels[16];
  int n = res->first;
@@ -108,8 +108,8 @@ static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
 extern void VP8EncDspCostInitSSE2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
-  VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
-  VP8GetResidualCost = GetResidualCost_SSE2;
+  VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
+  VP8GetResidualCost = GetResidualCostSSE2;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@@ -11,7 +11,7 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_HAVE_NEON_RTCD)
 #include <stdio.h>
@@ -23,11 +23,13 @@
 #endif

 //------------------------------------------------------------------------------
-// SSE2 detection.
+// x86/x86-64 micro-arch detection.
 //

+// skip x86 specific code for WASM builds
+#if defined(WEBP_USE_WASM)
 // apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
-#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+#elif (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "mov %%ebx, %%edi\n"
@@ -63,8 +65,10 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
 #define GetCPUInfo __cpuid
 #endif

+// skip xgetbv definition for WASM builds
+#if defined(WEBP_USE_WASM)
 // NaCl has no support for xgetbv or the raw opcode.
-#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+#elif !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
 static WEBP_INLINE uint64_t xgetbv(void) {
  const uint32_t ecx = 0;
  uint32_t eax, edx;
@@ -94,7 +98,19 @@ static WEBP_INLINE uint64_t xgetbv(void) {
 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
 #endif

-#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+//------------------------------------------------------------------------------
+// Platform specific VP8CPUInfo functions.
+//
+
+// WASM needs to precede platform specific architecture checks as the defines
+// will still be present when building this target.
+#if defined(WEBP_USE_WASM)
+static int wasmCPUInfo(CPUFeature feature) {
+  // Only the kWASM feature is ever reported as available.
+  return (feature == kWASM) ? 1 : 0;
+}
+VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
+#elif defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)

 // helper function for run-time detection of slow SSSE3 platforms
 static int CheckSlowModel(int info) {
@@ -143,7 +159,7 @@ static int x86CPUInfo(CPUFeature feature) {
    return !!(cpu_info[2] & (1 << 0));
  }
  if (feature == kSlowSSSE3) {
-    if (is_intel && (cpu_info[2] & (1 << 9))) {   // SSSE3?
+    if (is_intel && (cpu_info[2] & (1 << 9))) {   // SSSE3 is ECX bit 9
      return CheckSlowModel(cpu_info[0]);
    }
    return 0;
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -11,11 +11,9 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include <assert.h>
-
-#include "src/dsp/dsp.h"
-#include "src/dec/vp8i_dec.h"
-#include "src/utils/utils.h"
+#include "./dsp.h"
+#include "../dec/vp8i_dec.h"
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------

@@ -27,7 +25,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 // Transforms (Paragraph 14.4)

 #define STORE(x, y, v) \
-  dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))
+  dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))

 #define STORE2(y, dc, d, c) do {    \
  const int DC = (dc);              \
@@ -40,8 +38,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define MUL1(a) ((((a) * 20091) >> 16) + (a))
 #define MUL2(a) (((a) * 35468) >> 16)

-#if !WEBP_NEON_OMIT_C_CODE
-static void TransformOne_C(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* in, uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
@@ -81,7 +78,7 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
 }

 // Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const int a = in[0] + 4;
  const int c4 = MUL2(in[4]);
  const int d4 = MUL1(in[4]);
@@ -96,21 +93,19 @@ static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
 #undef MUL2
 #undef STORE2

-static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOne_C(in, dst);
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
  if (do_two) {
-    TransformOne_C(in + 16, dst + 4);
+    TransformOne(in + 16, dst + 4);
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-static void TransformUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformUV(const int16_t* in, uint8_t* dst) {
  VP8Transform(in + 0 * 16, dst, 1);
  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }

-#if !WEBP_NEON_OMIT_C_CODE
-static void TransformDC_C(const int16_t* in, uint8_t* dst) {
+static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int DC = in[0] + 4;
  int i, j;
  for (j = 0; j < 4; ++j) {
@@ -119,9 +114,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) {
    }
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV(const int16_t* in, uint8_t* dst) {
  if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
  if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
  if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@@ -133,8 +127,7 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
 //------------------------------------------------------------------------------
 // Paragraph 14.3

-#if !WEBP_NEON_OMIT_C_CODE
-static void TransformWHT_C(const int16_t* in, int16_t* out) {
+static void TransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {
@@ -160,7 +153,6 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) {
    out += 64;
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 void (*VP8TransformWHT)(const int16_t* in, int16_t* out);

@@ -169,7 +161,6 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);

 #define DST(x, y) dst[(x) + (y) * BPS]

-#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8_t* top = dst - BPS;
  const uint8_t* const clip0 = VP8kclip1 - top[-1];
@@ -183,21 +174,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
    dst += BPS;
  }
 }
-static void TM4_C(uint8_t* dst)   { TrueMotion(dst, 4); }
-static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16_C(uint8_t* dst)  { TrueMotion(dst, 16); }
+static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }

 //------------------------------------------------------------------------------
 // 16x16

-static void VE16_C(uint8_t* dst) {     // vertical
+static void VE16(uint8_t* dst) {     // vertical
  int j;
  for (j = 0; j < 16; ++j) {
    memcpy(dst + j * BPS, dst - BPS, 16);
  }
 }

-static void HE16_C(uint8_t* dst) {     // horizontal
+static void HE16(uint8_t* dst) {     // horizontal
  int j;
  for (j = 16; j > 0; --j) {
    memset(dst, dst[-1], 16);
@@ -212,7 +203,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
  }
 }

-static void DC16_C(uint8_t* dst) {    // DC
+static void DC16(uint8_t* dst) {    // DC
  int DC = 16;
  int j;
  for (j = 0; j < 16; ++j) {
@@ -221,7 +212,7 @@ static void DC16_C(uint8_t* dst) {    // DC
  Put16(DC >> 5, dst);
 }

-static void DC16NoTop_C(uint8_t* dst) {   // DC with top samples not available
+static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
  int DC = 8;
  int j;
  for (j = 0; j < 16; ++j) {
@@ -230,7 +221,7 @@ static void DC16NoTop_C(uint8_t* dst) {   // DC with top samples not available
  Put16(DC >> 4, dst);
 }

-static void DC16NoLeft_C(uint8_t* dst) {  // DC with left samples not available
+static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
  int DC = 8;
  int i;
  for (i = 0; i < 16; ++i) {
@@ -239,10 +230,9 @@ static void DC16NoLeft_C(uint8_t* dst) {  // DC with left samples not available
  Put16(DC >> 4, dst);
 }

-static void DC16NoTopLeft_C(uint8_t* dst) {  // DC with no top and left samples
+static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
  Put16(0x80, dst);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];

@@ -252,8 +242,7 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

-#if !WEBP_NEON_OMIT_C_CODE
-static void VE4_C(uint8_t* dst) {    // vertical
+static void VE4(uint8_t* dst) {    // vertical
  const uint8_t* top = dst - BPS;
  const uint8_t vals[4] = {
    AVG3(top[-1], top[0], top[1]),
@@ -266,9 +255,8 @@ static void VE4_C(uint8_t* dst) {    // vertical
    memcpy(dst + i * BPS, vals, sizeof(vals));
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-static void HE4_C(uint8_t* dst) {    // horizontal
+static void HE4(uint8_t* dst) {    // horizontal
  const int A = dst[-1 - BPS];
  const int B = dst[-1];
  const int C = dst[-1 + BPS];
@@ -280,8 +268,7 @@ static void HE4_C(uint8_t* dst) {    // horizontal
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
 }

-#if !WEBP_NEON_OMIT_C_CODE
-static void DC4_C(uint8_t* dst) {   // DC
+static void DC4(uint8_t* dst) {   // DC
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@@ -289,7 +276,7 @@ static void DC4_C(uint8_t* dst) {   // DC
  for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
 }

-static void RD4_C(uint8_t* dst) {   // Down-right
+static void RD4(uint8_t* dst) {   // Down-right
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@@ -308,7 +295,7 @@ static void RD4_C(uint8_t* dst) {   // Down-right
                                      DST(3, 0) = AVG3(D, C, B);
 }

-static void LD4_C(uint8_t* dst) {   // Down-Left
+static void LD4(uint8_t* dst) {   // Down-Left
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
@@ -325,9 +312,8 @@ static void LD4_C(uint8_t* dst) {   // Down-Left
                          DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
                                      DST(3, 3) = AVG3(G, H, H);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-static void VR4_C(uint8_t* dst) {   // Vertical-Right
+static void VR4(uint8_t* dst) {   // Vertical-Right
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@@ -349,7 +335,7 @@ static void VR4_C(uint8_t* dst) {   // Vertical-Right
  DST(3, 1) =             AVG3(B, C, D);
 }

-static void VL4_C(uint8_t* dst) {   // Vertical-Left
+static void VL4(uint8_t* dst) {   // Vertical-Left
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
@@ -371,7 +357,7 @@ static void VL4_C(uint8_t* dst) {   // Vertical-Left
              DST(3, 3) = AVG3(F, G, H);
 }

-static void HU4_C(uint8_t* dst) {   // Horizontal-Up
+static void HU4(uint8_t* dst) {   // Horizontal-Up
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@@ -386,7 +372,7 @@ static void HU4_C(uint8_t* dst) {   // Horizontal-Up
    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static void HD4_C(uint8_t* dst) {  // Horizontal-Down
+static void HD4(uint8_t* dst) {  // Horizontal-Down
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
@@ -418,15 +404,14 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
 //------------------------------------------------------------------------------
 // Chroma

-#if !WEBP_NEON_OMIT_C_CODE
-static void VE8uv_C(uint8_t* dst) {    // vertical
+static void VE8uv(uint8_t* dst) {    // vertical
  int j;
  for (j = 0; j < 8; ++j) {
    memcpy(dst + j * BPS, dst - BPS, 8);
  }
 }

-static void HE8uv_C(uint8_t* dst) {    // horizontal
+static void HE8uv(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    memset(dst, dst[-1], 8);
@@ -442,7 +427,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  }
 }

-static void DC8uv_C(uint8_t* dst) {     // DC
+static void DC8uv(uint8_t* dst) {     // DC
  int dc0 = 8;
  int i;
  for (i = 0; i < 8; ++i) {
@@ -451,7 +436,7 @@ static void DC8uv_C(uint8_t* dst) {     // DC
  Put8x8uv(dc0 >> 4, dst);
 }

-static void DC8uvNoLeft_C(uint8_t* dst) {   // DC with no left samples
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
@@ -460,7 +445,7 @@ static void DC8uvNoLeft_C(uint8_t* dst) {   // DC with no left samples
  Put8x8uv(dc0 >> 3, dst);
 }

-static void DC8uvNoTop_C(uint8_t* dst) {  // DC with no top samples
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
@@ -469,19 +454,17 @@ static void DC8uvNoTop_C(uint8_t* dst) {  // DC with no top samples
  Put8x8uv(dc0 >> 3, dst);
 }

-static void DC8uvNoTopLeft_C(uint8_t* dst) {    // DC with nothing
+static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
  // No neighbouring samples are read: the constant 0x80 goes to Put8x8uv().
  Put8x8uv(0x80, dst);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];

 //------------------------------------------------------------------------------
 // Edge filtering functions

-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 // 4 pixels in, 2 pixels out
-static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
  const int a1 = VP8ksclip2[(a + 4) >> 3];            // in [-16,15]
@@ -491,7 +474,7 @@ static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
 }

 // 4 pixels in, 4 pixels out
-static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0);
  const int a1 = VP8ksclip2[(a + 4) >> 3];
@@ -504,7 +487,7 @@ static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
 }

 // 6 pixels in, 6 pixels out
-static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
  const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2*step];
  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
@@ -520,22 +503,18 @@ static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
  p[ 2*step] = VP8kclip1[q2 - a3];
 }

-static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  // Reads p1, p0 (before 'p') and q0, q1 (from 'p' on), 'step' apart, and
  // returns non-zero when VP8kabs0[p1 - p0] or VP8kabs0[q1 - q0] exceeds
  // 'thresh'.
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

-#if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
  // Returns non-zero when 4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1] <= t.
  // p1, p0, q0, q1 are the samples at p[-2*step], p[-step], p[0], p[step].
  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
-                                      int step, int t, int it) {
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
  const int p0 = p[-step], q0 = p[0];
  const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
@@ -544,159 +523,140 @@ static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
         VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
         VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)

-#if !WEBP_NEON_OMIT_C_CODE
-static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  // Tests 16 adjacent positions p + i with 'stride' as the sample step and
  // filters those accepted by the threshold test against 2 * thresh + 1.
  int i;
  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (NeedsFilter_C(p + i, stride, thresh2)) {
-      DoFilter2_C(p + i, stride);
+    if (needs_filter(p + i, stride, thresh2)) {
+      do_filter2(p + i, stride);
    }
  }
 }

-static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  // Tests 16 positions p + i * stride with a sample step of 1 and filters
  // those accepted by the threshold test against 2 * thresh + 1.
  int i;
  const int thresh2 = 2 * thresh + 1;
  for (i = 0; i < 16; ++i) {
-    if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
-      DoFilter2_C(p + i * stride, 1);
+    if (needs_filter(p + i * stride, 1, thresh2)) {
+      do_filter2(p + i * stride, 1);
    }
  }
 }

-static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  // Calls SimpleVFilter16 three times, moving 'p' forward by 4 * stride
  // before each call, so the incoming 'p' position itself is not filtered.
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
-    SimpleVFilter16_C(p, stride, thresh);
+    SimpleVFilter16(p, stride, thresh);
  }
 }

-static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  // Calls SimpleHFilter16 three times, moving 'p' forward by 4 before each
  // call, so the incoming 'p' position itself is not filtered.
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
-    SimpleHFilter16_C(p, stride, thresh);
+    SimpleHFilter16(p, stride, thresh);
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)

-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
-                                       int hstride, int vstride, int size,
-                                       int thresh, int ithresh,
-                                       int hev_thresh) {
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
  // Walks 'size' positions spaced 'vstride' apart, sampling with step
  // 'hstride'. A position that passes needs_filter2() is handled by
  // do_filter2() when hev() is non-zero and by do_filter6() otherwise.
  // The edge limit actually tested is 2 * thresh + 1.
  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
-      if (Hev(p, hstride, hev_thresh)) {
-        DoFilter2_C(p, hstride);
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
      } else {
-        DoFilter6_C(p, hstride);
+        do_filter6(p, hstride);
      }
    }
    p += vstride;
  }
 }

-static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
-                                       int hstride, int vstride, int size,
-                                       int thresh, int ithresh,
-                                       int hev_thresh) {
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
  // Walks 'size' positions spaced 'vstride' apart, sampling with step
  // 'hstride'. A position that passes needs_filter2() is handled by
  // do_filter2() when hev() is non-zero and by do_filter4() otherwise.
  // The edge limit actually tested is 2 * thresh + 1.
  const int thresh2 = 2 * thresh + 1;
  while (size-- > 0) {
-    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
-      if (Hev(p, hstride, hev_thresh)) {
-        DoFilter2_C(p, hstride);
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
      } else {
-        DoFilter4_C(p, hstride);
+        do_filter4(p, hstride);
      }
    }
    p += vstride;
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

-#if !WEBP_NEON_OMIT_C_CODE
 // on macroblock edges
-static void VFilter16_C(uint8_t* p, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
  // FilterLoop26 with hstride = stride, vstride = 1, size = 16: the 16
  // positions p, p + 1, ... are filtered using 'stride' as the sample step.
+  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
 }

-static void HFilter16_C(uint8_t* p, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
  // FilterLoop26 with hstride = 1, vstride = stride, size = 16: the 16
  // positions p, p + stride, ... are filtered using a sample step of 1.
+  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
 }

 // on three inner edges
-static void VFilter16i_C(uint8_t* p, int stride,
-                         int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
  // Runs FilterLoop24 three times, advancing 'p' by 4 * stride before each
  // call; every call covers 16 positions with 'stride' as the sample step.
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
-    FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter16i_C(uint8_t* p, int stride,
-                         int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
  // Runs FilterLoop24 three times, advancing 'p' by 4 before each call;
  // every call covers 16 positions spaced 'stride' apart, sample step 1.
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
-    FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

-#if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
  // Identical FilterLoop26 call for both pointers: 8 adjacent positions on
  // 'u', then on 'v', each sampled with step 'stride'.
+  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
  // Identical FilterLoop26 call for both pointers: 8 positions spaced
  // 'stride' apart on 'u', then on 'v', each sampled with a step of 1.
+  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

-#if !WEBP_NEON_OMIT_C_CODE
-static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
  // One FilterLoop24 pass per pointer, starting 4 * stride past 'u' and 'v':
  // 8 adjacent positions each, sampled with step 'stride'.
+  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
  // One FilterLoop24 pass per pointer, starting 4 bytes past 'u' and 'v':
  // 8 positions spaced 'stride' apart each, sampled with a step of 1.
+  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 //------------------------------------------------------------------------------

-static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
-                               int dst_stride) {
+static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
+                             int dst_stride) {
  int i, j;
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) {
@@ -740,6 +700,7 @@ extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);
 extern void VP8DspInitMIPSdspR2(void);
 extern void VP8DspInitMSA(void);
+extern void VP8DspInitWASM(void);

 static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;
@@ -749,66 +710,54 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {

  VP8InitClipTables();

-#if !WEBP_NEON_OMIT_C_CODE
-  VP8TransformWHT = TransformWHT_C;
-  VP8Transform = TransformTwo_C;
-  VP8TransformDC = TransformDC_C;
-  VP8TransformAC3 = TransformAC3_C;
-#endif
-  VP8TransformUV = TransformUV_C;
-  VP8TransformDCUV = TransformDCUV_C;
+  VP8TransformWHT = TransformWHT;
+  VP8Transform = TransformTwo;
+  VP8TransformUV = TransformUV;
+  VP8TransformDC = TransformDC;
+  VP8TransformDCUV = TransformDCUV;
+  VP8TransformAC3 = TransformAC3;

-#if !WEBP_NEON_OMIT_C_CODE
-  VP8VFilter16 = VFilter16_C;
-  VP8VFilter16i = VFilter16i_C;
-  VP8HFilter16 = HFilter16_C;
-  VP8VFilter8 = VFilter8_C;
-  VP8VFilter8i = VFilter8i_C;
-  VP8SimpleVFilter16 = SimpleVFilter16_C;
-  VP8SimpleHFilter16 = SimpleHFilter16_C;
-  VP8SimpleVFilter16i = SimpleVFilter16i_C;
-  VP8SimpleHFilter16i = SimpleHFilter16i_C;
-#endif
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;

-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-  VP8HFilter16i = HFilter16i_C;
-  VP8HFilter8 = HFilter8_C;
-  VP8HFilter8i = HFilter8i_C;
-#endif
+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[3] = HE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[5] = VR4;
+  VP8PredLuma4[6] = LD4;
+  VP8PredLuma4[7] = VL4;
+  VP8PredLuma4[8] = HD4;
+  VP8PredLuma4[9] = HU4;

-#if !WEBP_NEON_OMIT_C_CODE
-  VP8PredLuma4[0] = DC4_C;
-  VP8PredLuma4[1] = TM4_C;
-  VP8PredLuma4[2] = VE4_C;
-  VP8PredLuma4[4] = RD4_C;
-  VP8PredLuma4[6] = LD4_C;
-#endif
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;

-  VP8PredLuma4[3] = HE4_C;
-  VP8PredLuma4[5] = VR4_C;
-  VP8PredLuma4[7] = VL4_C;
-  VP8PredLuma4[8] = HD4_C;
-  VP8PredLuma4[9] = HU4_C;
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;

-#if !WEBP_NEON_OMIT_C_CODE
-  VP8PredLuma16[0] = DC16_C;
-  VP8PredLuma16[1] = TM16_C;
-  VP8PredLuma16[2] = VE16_C;
-  VP8PredLuma16[3] = HE16_C;
-  VP8PredLuma16[4] = DC16NoTop_C;
-  VP8PredLuma16[5] = DC16NoLeft_C;
-  VP8PredLuma16[6] = DC16NoTopLeft_C;
-
-  VP8PredChroma8[0] = DC8uv_C;
-  VP8PredChroma8[1] = TM8uv_C;
-  VP8PredChroma8[2] = VE8uv_C;
-  VP8PredChroma8[3] = HE8uv_C;
-  VP8PredChroma8[4] = DC8uvNoTop_C;
-  VP8PredChroma8[5] = DC8uvNoLeft_C;
-  VP8PredChroma8[6] = DC8uvNoTopLeft_C;
-#endif
-
-  VP8DitherCombine8x8 = DitherCombine8x8_C;
+  VP8DitherCombine8x8 = DitherCombine8x8;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@@ -822,6 +771,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
 #endif
    }
 #endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8DspInitNEON();
+    }
+#endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8DspInitMIPS32();
@@ -837,58 +791,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
      VP8DspInitMSA();
    }
 #endif
-  }
-
-#if defined(WEBP_USE_NEON)
-  if (WEBP_NEON_OMIT_C_CODE ||
-      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
-    VP8DspInitNEON();
-  }
+#if defined(WEBP_USE_WASM)
+    if (VP8GetCPUInfo(kWASM)) {
+      VP8DspInitWASM();
+    }
 #endif
-
-  assert(VP8TransformWHT != NULL);
-  assert(VP8Transform != NULL);
-  assert(VP8TransformDC != NULL);
-  assert(VP8TransformAC3 != NULL);
-  assert(VP8TransformUV != NULL);
-  assert(VP8TransformDCUV != NULL);
-  assert(VP8VFilter16 != NULL);
-  assert(VP8HFilter16 != NULL);
-  assert(VP8VFilter8 != NULL);
-  assert(VP8HFilter8 != NULL);
-  assert(VP8VFilter16i != NULL);
-  assert(VP8HFilter16i != NULL);
-  assert(VP8VFilter8i != NULL);
-  assert(VP8HFilter8i != NULL);
-  assert(VP8SimpleVFilter16 != NULL);
-  assert(VP8SimpleHFilter16 != NULL);
-  assert(VP8SimpleVFilter16i != NULL);
-  assert(VP8SimpleHFilter16i != NULL);
-  assert(VP8PredLuma4[0] != NULL);
-  assert(VP8PredLuma4[1] != NULL);
-  assert(VP8PredLuma4[2] != NULL);
-  assert(VP8PredLuma4[3] != NULL);
-  assert(VP8PredLuma4[4] != NULL);
-  assert(VP8PredLuma4[5] != NULL);
-  assert(VP8PredLuma4[6] != NULL);
-  assert(VP8PredLuma4[7] != NULL);
-  assert(VP8PredLuma4[8] != NULL);
-  assert(VP8PredLuma4[9] != NULL);
-  assert(VP8PredLuma16[0] != NULL);
-  assert(VP8PredLuma16[1] != NULL);
-  assert(VP8PredLuma16[2] != NULL);
-  assert(VP8PredLuma16[3] != NULL);
-  assert(VP8PredLuma16[4] != NULL);
-  assert(VP8PredLuma16[5] != NULL);
-  assert(VP8PredLuma16[6] != NULL);
-  assert(VP8PredChroma8[0] != NULL);
-  assert(VP8PredChroma8[1] != NULL);
-  assert(VP8PredChroma8[2] != NULL);
-  assert(VP8PredChroma8[3] != NULL);
-  assert(VP8PredChroma8[4] != NULL);
-  assert(VP8PredChroma8[5] != NULL);
-  assert(VP8PredChroma8[6] != NULL);
-  assert(VP8DitherCombine8x8 != NULL);
-
+  }
  dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/dec_clip_tables.c
+++ b/src/dsp/dec_clip_tables.c
@@ -11,14 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

-// define to 0 to have run-time table initialization
-#if !defined(USE_STATIC_TABLES)
-#define USE_STATIC_TABLES 1   // ALTERNATE_CODE
-#endif
+#define USE_STATIC_TABLES     // undefine to have run-time table initialization

-#if (USE_STATIC_TABLES == 1)
+#ifdef USE_STATIC_TABLES

 static const uint8_t abs0[255 + 255 + 1] = {
  0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
@@ -340,7 +337,7 @@ static uint8_t clip1[255 + 511 + 1];
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;

-#endif    // USE_STATIC_TABLES
+#endif

 const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
 const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
@@ -348,7 +345,7 @@ const uint8_t* const VP8kclip1 = &clip1[255];
 const uint8_t* const VP8kabs0 = &abs0[255];

 WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
-#if (USE_STATIC_TABLES == 0)
+#if !defined(USE_STATIC_TABLES)
  int i;
  if (!tables_ok) {
    for (i = -255; i <= 255; ++i) {
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MIPS32)

-#include "src/dsp/mips_macro.h"
+#include "./mips_macro.h"

 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-#include "src/dsp/mips_macro.h"
+#include "./mips_macro.h"

 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
--- a/src/dsp/dec_msa.c
+++ b/src/dsp/dec_msa.c
@@ -12,11 +12,11 @@
 // Author(s):  Prashant Patil   (prashant.patil@imgtec.com)


-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MSA)

-#include "src/dsp/msa_macro.h"
+#include "./msa_macro.h"

 //------------------------------------------------------------------------------
 // Transforms
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@@ -12,25 +12,23 @@
 // Author: somnath@google.com (Somnath Banerjee)
 //         cduvivier@google.com (Christian Duvivier)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_SSE2)

 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
 // one it seems => disable it by default. Uncomment the following to enable:
-#if !defined(USE_TRANSFORM_AC3)
-#define USE_TRANSFORM_AC3 0   // ALTERNATE_CODE
-#endif
+// #define USE_TRANSFORM_AC3

 #include <emmintrin.h>
-#include "src/dsp/common_sse2.h"
-#include "src/dec/vp8i_dec.h"
-#include "src/utils/utils.h"
+#include "./common_sse2.h"
+#include "../dec/vp8i_dec.h"
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

-static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -195,7 +193,7 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
  }
 }

-#if (USE_TRANSFORM_AC3 == 1)
+#if defined(USE_TRANSFORM_AC3)
 #define MUL(a, b) (((a) * (b)) >> 16)
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
@@ -250,7 +248,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
    _mm_subs_epu8((p), (q)))

 // Shift each byte of "x" right by 3 bits while preserving the sign bit.
-static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
+static WEBP_INLINE void SignedShift8b(__m128i* const x) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
  const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
@@ -260,8 +258,8 @@ static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
 }

 #define FLIP_SIGN_BIT2(a, b) {                                                 \
-  (a) = _mm_xor_si128(a, sign_bit);                                            \
-  (b) = _mm_xor_si128(b, sign_bit);                                            \
+  a = _mm_xor_si128(a, sign_bit);                                              \
+  b = _mm_xor_si128(b, sign_bit);                                              \
 }

 #define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
@@ -270,11 +268,11 @@ static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
 }

 // input/output is uint8_t
-static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
-                                       const __m128i* const p0,
-                                       const __m128i* const q0,
-                                       const __m128i* const q1,
-                                       int hev_thresh, __m128i* const not_hev) {
+static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
+                                  const __m128i* const p0,
+                                  const __m128i* const q0,
+                                  const __m128i* const q1,
+                                  int hev_thresh, __m128i* const not_hev) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t_1 = MM_ABS(*p1, *p0);
  const __m128i t_2 = MM_ABS(*q1, *q0);
@@ -287,11 +285,11 @@ static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
 }

 // input pixels are int8_t
-static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
-                                          const __m128i* const p0,
-                                          const __m128i* const q0,
-                                          const __m128i* const q1,
-                                          __m128i* const delta) {
+static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
+                                     const __m128i* const p0,
+                                     const __m128i* const q0,
+                                     const __m128i* const q1,
+                                     __m128i* const delta) {
  // beware of addition order, for saturation!
  const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1);   // p1 - q1
  const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0);   // q0 - p0
@@ -302,16 +300,15 @@ static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
 }

 // input and output are int8_t
-static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
-                                            __m128i* const q0,
-                                            const __m128i* const fl) {
+static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
+                                       const __m128i* const fl) {
  // Per byte lane: v3 = (*fl + 3) >> 3 and v4 = (*fl + 4) >> 3, then
  // *p0 += v3 and *q0 -= v4. The adds/subs use the saturating epi8 forms;
  // '*fl' itself is only read.
  const __m128i k3 = _mm_set1_epi8(3);
  const __m128i k4 = _mm_set1_epi8(4);
  __m128i v3 = _mm_adds_epi8(*fl, k3);
  __m128i v4 = _mm_adds_epi8(*fl, k4);

-  SignedShift8b_SSE2(&v4);             // v4 >> 3
-  SignedShift8b_SSE2(&v3);             // v3 >> 3
+  SignedShift8b(&v4);                  // v4 >> 3
+  SignedShift8b(&v3);                  // v3 >> 3
  *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
  *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
 }
@@ -320,9 +317,9 @@ static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
 // Update operations:
 // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
 // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
-static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
-                                           const __m128i* const a0_lo,
-                                           const __m128i* const a0_hi) {
+static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
+                                      const __m128i* const a0_lo,
+                                      const __m128i* const a0_hi) {
  const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
  const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
  const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
@@ -333,11 +330,11 @@ static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
 }

 // input pixels are uint8_t
-static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
-                                         const __m128i* const p0,
-                                         const __m128i* const q0,
-                                         const __m128i* const q1,
-                                         int thresh, __m128i* const mask) {
+static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
+                                    const __m128i* const p0,
+                                    const __m128i* const q0,
+                                    const __m128i* const q1,
+                                    int thresh, __m128i* const mask) {
  const __m128i m_thresh = _mm_set1_epi8(thresh);
  const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
  const __m128i kFE = _mm_set1_epi8(0xFE);
@@ -356,29 +353,28 @@ static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
 // Edge filtering functions

 // Applies filter on 2 pixels (p0 and q0)
-static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
-                                       __m128i* const q0, __m128i* const q1,
-                                       int thresh) {
+static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
+                                  __m128i* const q0, __m128i* const q1,
+                                  int thresh) {
  // Only *p0 and *q0 are written: *p1 and *q1 are read into sign-flipped
  // local copies. The delta is ANDed with the NeedsFilter() mask before it
  // is applied, and the sign flip on *p0 / *q0 is undone by the final
  // FLIP_SIGN_BIT2.
  __m128i a, mask;
  const __m128i sign_bit = _mm_set1_epi8(0x80);
-  // convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
+  // convert p1/q1 to int8_t (for GetBaseDelta)
  const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
  const __m128i q1s = _mm_xor_si128(*q1, sign_bit);

-  NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask);
+  NeedsFilter(p1, p0, q0, q1, thresh, &mask);

  FLIP_SIGN_BIT2(*p0, *q0);
-  GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a);
+  GetBaseDelta(&p1s, p0, q0, &q1s, &a);
  a = _mm_and_si128(a, mask);     // mask filter values we don't care about
-  DoSimpleFilter_SSE2(p0, q0, &a);
+  DoSimpleFilter(p0, q0, &a);
  FLIP_SIGN_BIT2(*p0, *q0);
 }

 // Applies filter on 4 pixels (p1, p0, q0 and q1)
-static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
-                                       __m128i* const q0, __m128i* const q1,
-                                       const __m128i* const mask,
-                                       int hev_thresh) {
+static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
+                                  __m128i* const q0, __m128i* const q1,
+                                  const __m128i* const mask, int hev_thresh) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i k64 = _mm_set1_epi8(64);
@@ -388,7 +384,7 @@ static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
  __m128i t1, t2, t3;

  // compute hev mask
-  GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
+  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@@ -403,8 +399,8 @@ static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,

  t2 = _mm_adds_epi8(t1, k3);        // 3 * (q0 - p0) + hev(p1 - q1) + 3
  t3 = _mm_adds_epi8(t1, k4);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
-  SignedShift8b_SSE2(&t2);           // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
-  SignedShift8b_SSE2(&t3);           // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+  SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+  SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
  *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
  FLIP_SIGN_BIT2(*p0, *q0);
@@ -421,26 +417,25 @@ static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
 }

 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
-                                       __m128i* const p0, __m128i* const q0,
-                                       __m128i* const q1, __m128i* const q2,
-                                       const __m128i* const mask,
-                                       int hev_thresh) {
+static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
+                                  __m128i* const p0, __m128i* const q0,
+                                  __m128i* const q1, __m128i* const q2,
+                                  const __m128i* const mask, int hev_thresh) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  __m128i a, not_hev;

  // compute hev mask
-  GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
+  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
-  GetBaseDelta_SSE2(p1, p0, q0, q1, &a);
+  GetBaseDelta(p1, p0, q0, q1, &a);

  { // do simple filter on pixels with hev
    const __m128i m = _mm_andnot_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
-    DoSimpleFilter_SSE2(p0, q0, &f);
+    DoSimpleFilter(p0, q0, &f);
  }

  { // do strong filter on pixels with not hev
@@ -465,15 +460,15 @@ static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
    const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo);  // Filter * 27 + 63
    const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi);  // Filter * 27 + 63

-    Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi);
-    Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi);
-    Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi);
+    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
+    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
+    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
  }
 }

 // reads 8 rows across a vertical edge.
-static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
-                                     __m128i* const p, __m128i* const q) {
+static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
+                                __m128i* const p, __m128i* const q) {
  // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
  // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
  const __m128i A0 = _mm_set_epi32(
@@ -499,11 +494,11 @@ static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
  *q = _mm_unpackhi_epi32(C0, C1);
 }

-static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
-                                      const uint8_t* const r8,
-                                      int stride,
-                                      __m128i* const p1, __m128i* const p0,
-                                      __m128i* const q0, __m128i* const q1) {
+static WEBP_INLINE void Load16x4(const uint8_t* const r0,
+                                 const uint8_t* const r8,
+                                 int stride,
+                                 __m128i* const p1, __m128i* const p0,
+                                 __m128i* const q0, __m128i* const q1) {
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
  //                10 11 | 12 13
@@ -519,8 +514,8 @@ static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-  Load8x4_SSE2(r0, stride, p1, q0);
-  Load8x4_SSE2(r8, stride, p0, q1);
+  Load8x4(r0, stride, p1, q0);
+  Load8x4(r8, stride, p0, q1);

  {
    // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
@@ -536,8 +531,7 @@ static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
  }
 }

-static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
-                                      uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
@@ -546,12 +540,12 @@ static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
 }

 // Transpose back and store
-static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
-                                       const __m128i* const p0,
-                                       const __m128i* const q0,
-                                       const __m128i* const q1,
-                                       uint8_t* r0, uint8_t* r8,
-                                       int stride) {
+static WEBP_INLINE void Store16x4(const __m128i* const p1,
+                                  const __m128i* const p0,
+                                  const __m128i* const q0,
+                                  const __m128i* const q1,
+                                  uint8_t* r0, uint8_t* r8,
+                                  int stride) {
  __m128i t1, p1_s, p0_s, q0_s, q1_s;

  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
@@ -578,55 +572,55 @@ static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
  p1_s = _mm_unpacklo_epi16(t1, q1_s);
  q1_s = _mm_unpackhi_epi16(t1, q1_s);

-  Store4x4_SSE2(&p0_s, r0, stride);
+  Store4x4(&p0_s, r0, stride);
  r0 += 4 * stride;
-  Store4x4_SSE2(&q0_s, r0, stride);
+  Store4x4(&q0_s, r0, stride);

-  Store4x4_SSE2(&p1_s, r8, stride);
+  Store4x4(&p1_s, r8, stride);
  r8 += 4 * stride;
-  Store4x4_SSE2(&q1_s, r8, stride);
+  Store4x4(&q1_s, r8, stride);
 }

 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)

-static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  // Load
  __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
  __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
  __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
  __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);

-  DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
+  DoFilter2(&p1, &p0, &q0, &q1, thresh);

  // Store
  _mm_storeu_si128((__m128i*)&p[-stride], p0);
  _mm_storeu_si128((__m128i*)&p[0], q0);
 }

-static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  __m128i p1, p0, q0, q1;

  p -= 2;  // beginning of p1

-  Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
-  DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
-  Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
+  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+  Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
 }

-static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
-    SimpleVFilter16_SSE2(p, stride, thresh);
+    SimpleVFilter16(p, stride, thresh);
  }
 }

-static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
-    SimpleHFilter16_SSE2(p, stride, thresh);
+    SimpleHFilter16(p, stride, thresh);
  }
 }

@@ -634,60 +628,60 @@ static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
 // Complex In-loop filtering (Paragraph 15.3)

 #define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
-  (m) = MM_ABS(p1, p0);                                                        \
-  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
-  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
+  (m) = MM_ABS(p1, p0);                                                        \
+  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)

 #define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
-  (m) = _mm_max_epu8(m, MM_ABS(p1, p0));                                       \
-  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
-  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p1, p0));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)

 #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
-  (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]);                        \
-  (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]);                        \
-  (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]);                        \
-  (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]);                        \
+  (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]);                        \
+  (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]);                        \
+  (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]);                        \
+  (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]);                        \
 }

 #define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
   const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
   const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);                 \
-  (p) = _mm_unpacklo_epi64(U, V);                                              \
+  (p) = _mm_unpacklo_epi64(U, V);                                              \
 } while (0)

 #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
-  LOADUV_H_EDGE(e1, u, v, 0 * (stride));                                       \
-  LOADUV_H_EDGE(e2, u, v, 1 * (stride));                                       \
-  LOADUV_H_EDGE(e3, u, v, 2 * (stride));                                       \
-  LOADUV_H_EDGE(e4, u, v, 3 * (stride));                                       \
+  LOADUV_H_EDGE(e1, u, v, 0 * (stride));                                       \
+  LOADUV_H_EDGE(e2, u, v, 1 * (stride));                                       \
+  LOADUV_H_EDGE(e3, u, v, 2 * (stride));                                       \
+  LOADUV_H_EDGE(e4, u, v, 3 * (stride));                                       \
 }

 #define STOREUV(p, u, v, stride) {                                             \
-  _mm_storel_epi64((__m128i*)&(u)[(stride)], p);                               \
-  (p) = _mm_srli_si128(p, 8);                                                  \
-  _mm_storel_epi64((__m128i*)&(v)[(stride)], p);                               \
+  _mm_storel_epi64((__m128i*)&(u)[(stride)], p);                               \
+  (p) = _mm_srli_si128(p, 8);                                                  \
+  _mm_storel_epi64((__m128i*)&(v)[(stride)], p);                               \
 }

-static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
-                                         const __m128i* const p0,
-                                         const __m128i* const q0,
-                                         const __m128i* const q1,
-                                         int thresh, int ithresh,
-                                         __m128i* const mask) {
+static WEBP_INLINE void ComplexMask(const __m128i* const p1,
+                                    const __m128i* const p0,
+                                    const __m128i* const q0,
+                                    const __m128i* const q1,
+                                    int thresh, int ithresh,
+                                    __m128i* const mask) {
  const __m128i it = _mm_set1_epi8(ithresh);
  const __m128i diff = _mm_subs_epu8(*mask, it);
  const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
  __m128i filter_mask;
-  NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask);
+  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
  *mask = _mm_and_si128(thresh_mask, filter_mask);
 }

 // on macroblock edges
-static void VFilter16_SSE2(uint8_t* p, int stride,
-                           int thresh, int ithresh, int hev_thresh) {
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
  __m128i t1;
  __m128i mask;
  __m128i p2, p1, p0, q0, q1, q2;
@@ -700,8 +694,8 @@ static void VFilter16_SSE2(uint8_t* p, int stride,
  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);

-  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store
  _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
@@ -712,28 +706,28 @@ static void VFilter16_SSE2(uint8_t* p, int stride,
  _mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
 }

-static void HFilter16_SSE2(uint8_t* p, int stride,
-                           int thresh, int ithresh, int hev_thresh) {
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const b = p - 4;
-  Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
+  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
  MAX_DIFF1(p3, p2, p1, p0, mask);

-  Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
+  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);

-  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

-  Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
-  Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
+  Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
+  Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
 }

 // on three inner edges
-static void VFilter16i_SSE2(uint8_t* p, int stride,
-                            int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i p3, p2, p1, p0;   // loop invariants

@@ -750,8 +744,8 @@ static void VFilter16i_SSE2(uint8_t* p, int stride,

    // p3 and p2 are not just temporary variables here: they will be
    // re-used for next span. And q2/q3 will become p1/p0 accordingly.
-    ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
-    DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);

    // Store
    _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
@@ -765,12 +759,12 @@ static void VFilter16i_SSE2(uint8_t* p, int stride,
  }
 }

-static void HFilter16i_SSE2(uint8_t* p, int stride,
-                            int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i p3, p2, p1, p0;   // loop invariants

-  Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
+  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue

  for (k = 3; k > 0; --k) {
    __m128i mask, tmp1, tmp2;
@@ -779,13 +773,13 @@ static void HFilter16i_SSE2(uint8_t* p, int stride,
    p += 4;  // beginning of q0 (and next span)

    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
-    Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);

-    ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
-    DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);

-    Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
+    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);

    // rotate samples
    p1 = tmp1;
@@ -794,8 +788,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride,
 }

 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, p2, p1, p0, q0, q1, q2;

@@ -807,8 +801,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);

-  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store
  STOREUV(p2, u, v, -3 * stride);
@@ -819,28 +813,28 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q2, u, v, 2 * stride);
 }

-static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const tu = u - 4;
  uint8_t* const tv = v - 4;
-  Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0);
+  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
  MAX_DIFF1(p3, p2, p1, p0, mask);

-  Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3);
+  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);

-  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

-  Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride);
-  Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
+  Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
+  Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
 }

-static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
-                           int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;

@@ -855,8 +849,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
  LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
  MAX_DIFF2(t2, t1, q1, q0, mask);

-  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // Store
  STOREUV(p1, u, v, -2 * stride);
@@ -865,24 +859,24 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q1, u, v, 1 * stride);
 }

-static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
-                           int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
-  Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
+  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
  MAX_DIFF1(t2, t1, p1, p0, mask);

  u += 4;  // beginning of q0
  v += 4;
-  Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
  MAX_DIFF2(t2, t1, q1, q0, mask);

-  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  u -= 2;  // beginning of p1
  v -= 2;
-  Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride);
+  Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
 }

 //------------------------------------------------------------------------------
@@ -899,7 +893,7 @@ static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1

-static void VE4_SSE2(uint8_t* dst) {    // vertical
+static void VE4(uint8_t* dst) {    // vertical
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -915,7 +909,7 @@ static void VE4_SSE2(uint8_t* dst) {    // vertical
  }
 }

-static void LD4_SSE2(uint8_t* dst) {   // Down-Left
+static void LD4(uint8_t* dst) {   // Down-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -931,7 +925,7 @@ static void LD4_SSE2(uint8_t* dst) {   // Down-Left
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }

-static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
+static void VR4(uint8_t* dst) {   // Vertical-Right
  const __m128i one = _mm_set1_epi8(1);
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
@@ -956,7 +950,7 @@ static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
  DST(0, 3) = AVG3(K, J, I);
 }

-static void VL4_SSE2(uint8_t* dst) {   // Vertical-Left
+static void VL4(uint8_t* dst) {   // Vertical-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@@ -981,7 +975,7 @@ static void VL4_SSE2(uint8_t* dst) {   // Vertical-Left
  DST(3, 3) = (extra_out >> 8) & 0xff;
 }

-static void RD4_SSE2(uint8_t* dst) {   // Down-right
+static void RD4(uint8_t* dst) {   // Down-right
  const __m128i one = _mm_set1_epi8(1);
  const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
  const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
@@ -1010,7 +1004,7 @@ static void RD4_SSE2(uint8_t* dst) {   // Down-right
 //------------------------------------------------------------------------------
 // Luma 16x16

-static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8_t* top = dst - BPS;
  const __m128i zero = _mm_setzero_si128();
  int y;
@@ -1047,11 +1041,11 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
  }
 }

-static void TM4_SSE2(uint8_t* dst)   { TrueMotion_SSE2(dst, 4); }
-static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); }
-static void TM16_SSE2(uint8_t* dst)  { TrueMotion_SSE2(dst, 16); }
+static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }

-static void VE16_SSE2(uint8_t* dst) {
+static void VE16(uint8_t* dst) {
  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
  int j;
  for (j = 0; j < 16; ++j) {
@@ -1059,7 +1053,7 @@ static void VE16_SSE2(uint8_t* dst) {
  }
 }

-static void HE16_SSE2(uint8_t* dst) {     // horizontal
+static void HE16(uint8_t* dst) {     // horizontal
  int j;
  for (j = 16; j > 0; --j) {
    const __m128i values = _mm_set1_epi8(dst[-1]);
@@ -1068,7 +1062,7 @@ static void HE16_SSE2(uint8_t* dst) {     // horizontal
  }
 }

-static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 16; ++j) {
@@ -1076,7 +1070,7 @@ static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
  }
 }

-static void DC16_SSE2(uint8_t* dst) {  // DC
+static void DC16(uint8_t* dst) {    // DC
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
  const __m128i sad8x2 = _mm_sad_epu8(top, zero);
@@ -1089,37 +1083,37 @@ static void DC16_SSE2(uint8_t* dst) {  // DC
  }
  {
    const int DC = _mm_cvtsi128_si32(sum) + left + 16;
-    Put16_SSE2(DC >> 5, dst);
+    Put16(DC >> 5, dst);
  }
 }

-static void DC16NoTop_SSE2(uint8_t* dst) {  // DC with top samples unavailable
+static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
  int DC = 8;
  int j;
  for (j = 0; j < 16; ++j) {
    DC += dst[-1 + j * BPS];
  }
-  Put16_SSE2(DC >> 4, dst);
+  Put16(DC >> 4, dst);
 }

-static void DC16NoLeft_SSE2(uint8_t* dst) {  // DC with left samples unavailable
+static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
  const __m128i sad8x2 = _mm_sad_epu8(top, zero);
  // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
  const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
  const int DC = _mm_cvtsi128_si32(sum) + 8;
-  Put16_SSE2(DC >> 4, dst);
+  Put16(DC >> 4, dst);
 }

-static void DC16NoTopLeft_SSE2(uint8_t* dst) {  // DC with no top & left samples
-  Put16_SSE2(0x80, dst);
+static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
+  Put16(0x80, dst);
 }

 //------------------------------------------------------------------------------
 // Chroma

-static void VE8uv_SSE2(uint8_t* dst) {    // vertical
+static void VE8uv(uint8_t* dst) {    // vertical
  int j;
  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
  for (j = 0; j < 8; ++j) {
@@ -1127,8 +1121,17 @@ static void VE8uv_SSE2(uint8_t* dst) {    // vertical
  }
 }

+// Horizontal chroma prediction: fills each of the 8 rows with its left pixel.
+static void HE8uv(uint8_t* dst) {
+  int row;
+  for (row = 8; row > 0; --row, dst += BPS) {
+    const __m128i left = _mm_set1_epi8(dst[-1]);
+    _mm_storel_epi64((__m128i*)dst, left);
+  }
+}
+
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 8; ++j) {
@@ -1136,7 +1139,7 @@ static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
  }
 }

-static void DC8uv_SSE2(uint8_t* dst) {     // DC
+static void DC8uv(uint8_t* dst) {     // DC
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
  const __m128i sum = _mm_sad_epu8(top, zero);
@@ -1147,29 +1150,29 @@ static void DC8uv_SSE2(uint8_t* dst) {     // DC
  }
  {
    const int DC = _mm_cvtsi128_si32(sum) + left + 8;
-    Put8x8uv_SSE2(DC >> 4, dst);
+    Put8x8uv(DC >> 4, dst);
  }
 }

-static void DC8uvNoLeft_SSE2(uint8_t* dst) {   // DC with no left samples
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
  const __m128i sum = _mm_sad_epu8(top, zero);
  const int DC = _mm_cvtsi128_si32(sum) + 4;
-  Put8x8uv_SSE2(DC >> 3, dst);
+  Put8x8uv(DC >> 3, dst);
 }

-static void DC8uvNoTop_SSE2(uint8_t* dst) {  // DC with no top samples
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
    dc0 += dst[-1 + i * BPS];
  }
-  Put8x8uv_SSE2(dc0 >> 3, dst);
+  Put8x8uv(dc0 >> 3, dst);
 }

-static void DC8uvNoTopLeft_SSE2(uint8_t* dst) {    // DC with nothing
-  Put8x8uv_SSE2(0x80, dst);
+static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
+  Put8x8uv(0x80, dst);
 }

 //------------------------------------------------------------------------------
@@ -1178,46 +1181,47 @@ static void DC8uvNoTopLeft_SSE2(uint8_t* dst) {    // DC with nothing
 extern void VP8DspInitSSE2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
-  VP8Transform = Transform_SSE2;
-#if (USE_TRANSFORM_AC3 == 1)
-  VP8TransformAC3 = TransformAC3_SSE2;
+  VP8Transform = Transform;
+#if defined(USE_TRANSFORM_AC3)
+  VP8TransformAC3 = TransformAC3;
 #endif

-  VP8VFilter16 = VFilter16_SSE2;
-  VP8HFilter16 = HFilter16_SSE2;
-  VP8VFilter8 = VFilter8_SSE2;
-  VP8HFilter8 = HFilter8_SSE2;
-  VP8VFilter16i = VFilter16i_SSE2;
-  VP8HFilter16i = HFilter16i_SSE2;
-  VP8VFilter8i = VFilter8i_SSE2;
-  VP8HFilter8i = HFilter8i_SSE2;
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;

-  VP8SimpleVFilter16 = SimpleVFilter16_SSE2;
-  VP8SimpleHFilter16 = SimpleHFilter16_SSE2;
-  VP8SimpleVFilter16i = SimpleVFilter16i_SSE2;
-  VP8SimpleHFilter16i = SimpleHFilter16i_SSE2;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;

-  VP8PredLuma4[1] = TM4_SSE2;
-  VP8PredLuma4[2] = VE4_SSE2;
-  VP8PredLuma4[4] = RD4_SSE2;
-  VP8PredLuma4[5] = VR4_SSE2;
-  VP8PredLuma4[6] = LD4_SSE2;
-  VP8PredLuma4[7] = VL4_SSE2;
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[5] = VR4;
+  VP8PredLuma4[6] = LD4;
+  VP8PredLuma4[7] = VL4;

-  VP8PredLuma16[0] = DC16_SSE2;
-  VP8PredLuma16[1] = TM16_SSE2;
-  VP8PredLuma16[2] = VE16_SSE2;
-  VP8PredLuma16[3] = HE16_SSE2;
-  VP8PredLuma16[4] = DC16NoTop_SSE2;
-  VP8PredLuma16[5] = DC16NoLeft_SSE2;
-  VP8PredLuma16[6] = DC16NoTopLeft_SSE2;
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;

-  VP8PredChroma8[0] = DC8uv_SSE2;
-  VP8PredChroma8[1] = TM8uv_SSE2;
-  VP8PredChroma8[2] = VE8uv_SSE2;
-  VP8PredChroma8[4] = DC8uvNoTop_SSE2;
-  VP8PredChroma8[5] = DC8uvNoLeft_SSE2;
-  VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2;
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/dec_sse41.c
+++ b/src/dsp/dec_sse41.c
@@ -11,15 +11,15 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_SSE41)

 #include <smmintrin.h>
-#include "src/dec/vp8i_dec.h"
-#include "src/utils/utils.h"
+#include "../dec/vp8i_dec.h"
+#include "../utils/utils.h"

-static void HE16_SSE41(uint8_t* dst) {     // horizontal
+static void HE16(uint8_t* dst) {     // horizontal
  int j;
  const __m128i kShuffle3 = _mm_set1_epi8(3);
  for (j = 16; j > 0; --j) {
@@ -36,7 +36,7 @@ static void HE16_SSE41(uint8_t* dst) {     // horizontal
 extern void VP8DspInitSSE41(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
-  VP8PredLuma16[3] = HE16_SSE41;
+  VP8PredLuma16[3] = HE16;
 }

 #else  // !WEBP_USE_SSE41
--- a/src/dsp/dec_wasm.c
+++ b/src/dsp/dec_wasm.c
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -15,10 +15,10 @@
 #define WEBP_DSP_DSP_H_

 #ifdef HAVE_CONFIG_H
-#include "src/webp/config.h"
+#include "../webp/config.h"
 #endif

-#include "src/webp/types.h"
+#include "../webp/types.h"

 #ifdef __cplusplus
 extern "C" {
@@ -51,8 +51,9 @@ extern "C" {
 # define __has_builtin(x) 0
 #endif

-// for now, none of the optimizations below are available in emscripten
-#if !defined(EMSCRIPTEN)
+// For now, none of the optimizations below are available in emscripten.
+// WebAssembly overrides native optimizations.
+#if !(defined(EMSCRIPTEN) || defined(WEBP_USE_WASM))

 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
@@ -104,7 +105,7 @@ extern "C" {
 #define WEBP_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WEBP_USE_MIPS32_R2
-#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
+#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
 #define WEBP_USE_MIPS_DSP_R2
 #endif
 #endif
@@ -116,22 +117,6 @@ extern "C" {

 #endif  /* EMSCRIPTEN */

-#ifndef WEBP_DSP_OMIT_C_CODE
-#define WEBP_DSP_OMIT_C_CODE 1
-#endif
-
-#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
-#define WEBP_NEON_OMIT_C_CODE 1
-#else
-#define WEBP_NEON_OMIT_C_CODE 0
-#endif
-
-#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
-#define WEBP_NEON_WORK_AROUND_GCC 1
-#else
-#define WEBP_NEON_WORK_AROUND_GCC 0
-#endif
-
 // This macro prevents thread_sanitizer from reporting known concurrent writes.
 #define WEBP_TSAN_IGNORE_FUNCTION
 #if defined(__has_feature)
@@ -161,18 +146,6 @@ extern "C" {
 #endif
 #endif

-// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
-#if !defined(WEBP_SWAP_16BIT_CSP)
-#define WEBP_SWAP_16BIT_CSP 0
-#endif
-
-// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
-#if !defined(WORDS_BIGENDIAN) && \
-    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
-     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
-#define WORDS_BIGENDIAN
-#endif
-
 typedef enum {
  kSSE2,
  kSSE3,
@@ -183,11 +156,12 @@ typedef enum {
  kNEON,
  kMIPS32,
  kMIPSdspR2,
-  kMSA
+  kMSA,
+  kWASM
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
-WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
+WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;

 //------------------------------------------------------------------------------
 // Init stub generator
@@ -315,7 +289,6 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
                                        int xo, int yo,  // center position
                                        int W, int H);   // plane dimension

-#if !defined(WEBP_REDUCE_SIZE)
 // This version is called with the guarantee that you can load 8 bytes and
 // 8 rows at offset src1 and src2
 typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
@@ -323,13 +296,10 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,

 extern VP8SSIMGetFunc VP8SSIMGet;         // unclipped / unchecked
 extern VP8SSIMGetClippedFunc VP8SSIMGetClipped;   // with clipping
-#endif

-#if !defined(WEBP_DISABLE_STATS)
 typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
                                         const uint8_t* src2, int len);
 extern VP8AccumulateSSEFunc VP8AccumulateSSE;
-#endif

 // must be called before using any of the above directly
 void VP8SSIMDspInit(void);
@@ -510,12 +480,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
 extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;

 // Plain-C implementation, as fall-back.
-extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
-                                          const uint8_t* src);
-extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
-                                          const uint8_t* src);
-extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
-extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
+extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
+                                         const uint8_t* src);
+extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
+                                         const uint8_t* src);
+extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
+extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);

 // Main entry calls:
 extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
@@ -581,29 +551,25 @@ void WebPMultRows(uint8_t* ptr, int stride,
                  int width, int num_rows, int inverse);

 // Plain-C versions, used as fallback by some implementations.
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
-                   int width, int inverse);
-void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
-
-#ifdef WORDS_BIGENDIAN
-// ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
-                            const uint8_t* g, const uint8_t* b, int len,
-                            uint32_t* out);
-#endif
-
-// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                           int len, int step, uint32_t* out);
-
-// This function returns true if src[i] contains a value different from 0xff.
-extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
-// This function returns true if src[4*i] contains a value different from 0xff.
-extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+                  int width, int inverse);
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);

 // To be called first before using the above.
 void WebPInitAlphaProcessing(void);

+// ARGB packing function: a/r/g/b input is rgba or bgra order.
+extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
+                           const uint8_t* g, const uint8_t* b, int len,
+                           uint32_t* out);
+
+// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
+extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                          int len, int step, uint32_t* out);
+
+// Must be called before using VP8PackARGB / VP8PackRGB.
+void VP8EncDspARGBInit(void);
+
 //------------------------------------------------------------------------------
 // Filter functions

--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -14,18 +14,16 @@
 #include <assert.h>
 #include <stdlib.h>  // for abs()

-#include "src/dsp/dsp.h"
-#include "src/enc/vp8i_enc.h"
+#include "./dsp.h"
+#include "../enc/vp8i_enc.h"

 static WEBP_INLINE uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }

-#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int clip_max(int v, int max) {
  return (v > max) ? max : v;
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
@@ -58,10 +56,9 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
  histo->last_non_zero = last_non_zero;
 }

-#if !WEBP_NEON_OMIT_C_CODE
-static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
-                               int start_block, int end_block,
-                               VP8Histogram* const histo) {
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
@@ -79,7 +76,6 @@ static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
  }
  VP8SetHistogramData(distribution, histo);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // run-time tables (~4k)
@@ -104,8 +100,6 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

-#if !WEBP_NEON_OMIT_C_CODE
-
 #define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))

@@ -146,15 +140,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  }
 }

-static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                         int do_two) {
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

-static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@@ -182,16 +176,13 @@ static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
-                          int16_t* out) {
+static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  VP8FTransform(src, ref, out);
  VP8FTransform(src + 4, ref + 4, out + 16);
 }

-#if !WEBP_NEON_OMIT_C_CODE
-static void FTransformWHT_C(const int16_t* in, int16_t* out) {
+static void FTransformWHT(const int16_t* in, int16_t* out) {
  // input is 12b signed
  int32_t tmp[16];
  int i;
@@ -220,7 +211,6 @@ static void FTransformWHT_C(const int16_t* in, int16_t* out) {
    out[12 + i] = b3 >> 1;
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 #undef MUL
 #undef STORE
@@ -313,8 +303,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

-static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
-                               const uint8_t* top) {
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+                             const uint8_t* top) {
  // U block
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
@@ -333,8 +323,8 @@ static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

-static void Intra16Preds_C(uint8_t* dst,
-                           const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds(uint8_t* dst,
+                         const uint8_t* left, const uint8_t* top) {
  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
  VerticalPred(I16VE16 + dst, top, 16);
  HorizontalPred(I16HE16 + dst, left, 16);
@@ -517,7 +507,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {

 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@@ -533,7 +523,6 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Metric

-#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
                              int w, int h) {
  int count = 0;
@@ -549,21 +538,20 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
  return count;
 }

-static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 16, 16);
 }
-static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 16, 8);
 }
-static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 8, 8);
 }
-static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 4, 4);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    uint32_t avg = 0;
@@ -583,7 +571,6 @@ static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.

-#if !WEBP_NEON_OMIT_C_CODE
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
@@ -621,25 +608,24 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
  return sum;
 }

-static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
  return abs(sum2 - sum1) >> 5;
 }

-static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
-                        const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4_C(a + x + y, b + x + y, w);
+      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Quantization
@@ -650,8 +636,8 @@ static const uint8_t kZigzag[16] = {
 };

 // Simple quantization
-static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
-                           const VP8Matrix* const mtx) {
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
@@ -676,15 +662,13 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
  return (last >= 0);
 }

-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
-                             const VP8Matrix* const mtx) {
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 //------------------------------------------------------------------------------
 // Block copy
@@ -698,11 +682,11 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  }
 }

-static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
+static void Copy4x4(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 4, 4);
 }

-static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
+static void Copy16x8(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 16, 8);
 }

@@ -750,32 +734,26 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  InitTables();

  // default C implementations
-#if !WEBP_NEON_OMIT_C_CODE
-  VP8ITransform = ITransform_C;
-  VP8FTransform = FTransform_C;
-  VP8FTransformWHT = FTransformWHT_C;
-  VP8TDisto4x4 = Disto4x4_C;
-  VP8TDisto16x16 = Disto16x16_C;
-  VP8CollectHistogram = CollectHistogram_C;
-  VP8SSE16x16 = SSE16x16_C;
-  VP8SSE16x8 = SSE16x8_C;
-  VP8SSE8x8 = SSE8x8_C;
-  VP8SSE4x4 = SSE4x4_C;
-#endif
-
-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-  VP8EncQuantizeBlock = QuantizeBlock_C;
-  VP8EncQuantize2Blocks = Quantize2Blocks_C;
-#endif
-
-  VP8FTransform2 = FTransform2_C;
-  VP8EncPredLuma4 = Intra4Preds_C;
-  VP8EncPredLuma16 = Intra16Preds_C;
-  VP8EncPredChroma8 = IntraChromaPreds_C;
-  VP8Mean16x4 = Mean16x4_C;
-  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
-  VP8Copy4x4 = Copy4x4_C;
-  VP8Copy16x8 = Copy16x8_C;
+  VP8CollectHistogram = CollectHistogram;
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+  VP8FTransform2 = FTransform2;
+  VP8FTransformWHT = FTransformWHT;
+  VP8EncPredLuma4 = Intra4Preds;
+  VP8EncPredLuma16 = Intra16Preds;
+  VP8EncPredChroma8 = IntraChromaPreds;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE4x4 = SSE4x4;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  VP8Mean16x4 = Mean16x4;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8EncQuantizeBlockWHT = QuantizeBlock;
+  VP8Copy4x4 = Copy4x4;
+  VP8Copy16x8 = Copy16x8;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@@ -794,6 +772,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
      VP8EncDspInitAVX2();
    }
 #endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8EncDspInitNEON();
+    }
+#endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
@@ -810,34 +793,5 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
    }
 #endif
  }
-
-#if defined(WEBP_USE_NEON)
-  if (WEBP_NEON_OMIT_C_CODE ||
-      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
-    VP8EncDspInitNEON();
-  }
-#endif
-
-  assert(VP8ITransform != NULL);
-  assert(VP8FTransform != NULL);
-  assert(VP8FTransformWHT != NULL);
-  assert(VP8TDisto4x4 != NULL);
-  assert(VP8TDisto16x16 != NULL);
-  assert(VP8CollectHistogram != NULL);
-  assert(VP8SSE16x16 != NULL);
-  assert(VP8SSE16x8 != NULL);
-  assert(VP8SSE8x8 != NULL);
-  assert(VP8SSE4x4 != NULL);
-  assert(VP8EncQuantizeBlock != NULL);
-  assert(VP8EncQuantize2Blocks != NULL);
-  assert(VP8FTransform2 != NULL);
-  assert(VP8EncPredLuma4 != NULL);
-  assert(VP8EncPredLuma16 != NULL);
-  assert(VP8EncPredChroma8 != NULL);
-  assert(VP8Mean16x4 != NULL);
-  assert(VP8EncQuantizeBlockWHT != NULL);
-  assert(VP8Copy4x4 != NULL);
-  assert(VP8Copy16x8 != NULL);
-
  enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/enc_avx2.c
+++ b/src/dsp/enc_avx2.c
@@ -9,7 +9,7 @@
 //
 // AVX2 version of speed-critical encoding functions.

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_AVX2)

--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@@ -13,13 +13,13 @@
 //            Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 //            Slobodan Prijic  (slobodan.prijic@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MIPS32)

-#include "src/dsp/mips_macro.h"
-#include "src/enc/vp8i_enc.h"
-#include "src/enc/cost_enc.h"
+#include "./mips_macro.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"

 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
@@ -113,9 +113,8 @@ static const int kC2 = 35468;
  "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"

 // Does one or two inverse transforms.
-static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
-                                             const int16_t* in,
-                                             uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@@ -145,11 +144,11 @@ static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
  );
 }

-static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
-                              uint8_t* dst, int do_two) {
-  ITransformOne_MIPS32(ref, in, dst);
+static void ITransform(const uint8_t* ref, const int16_t* in,
+                       uint8_t* dst, int do_two) {
+  ITransformOne(ref, in, dst);
  if (do_two) {
-    ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
+    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

@@ -188,8 +187,8 @@ static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
  "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
  "sh           %[level],       " #N "(%[pout])                     \n\t"

-static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
-                                const VP8Matrix* const mtx) {
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int sign, coeff, level, i;
  int max_level = MAX_LEVEL;
@@ -239,11 +238,11 @@ static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
  return 0;
 }

-static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
-                                  const VP8Matrix* const mtx) {
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
  int nz;
-  nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
 }

@@ -362,8 +361,8 @@ static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
  "msub   %[temp6],  %[temp0]                \n\t"                \
  "msub   %[temp7],  %[temp1]                \n\t"

-static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
  int tmp[32];
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

@@ -397,13 +396,13 @@ static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS

-static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
-                             const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
+      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
@@ -479,8 +478,7 @@ static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"

-static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
-                              int16_t* out) {
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
  int temp17, temp18, temp19, temp20;
@@ -541,7 +539,7 @@ static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)

-static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@@ -575,7 +573,7 @@ static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@@ -601,7 +599,7 @@ static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@@ -623,7 +621,7 @@ static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@@ -653,20 +651,17 @@ static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
 extern void VP8EncDspInitMIPS32(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
-  VP8ITransform = ITransform_MIPS32;
-  VP8FTransform = FTransform_MIPS32;
-
-  VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
-  VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
-
-  VP8TDisto4x4 = Disto4x4_MIPS32;
-  VP8TDisto16x16 = Disto16x16_MIPS32;
-
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
 #if !defined(WORK_AROUND_GCC)
-  VP8SSE16x16 = SSE16x16_MIPS32;
-  VP8SSE8x8 = SSE8x8_MIPS32;
-  VP8SSE16x8 = SSE16x8_MIPS32;
-  VP8SSE4x4 = SSE4x4_MIPS32;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE4x4 = SSE4x4;
 #endif
 }

--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@@ -12,13 +12,13 @@
 // Author(s): Darko Laus (darko.laus@imgtec.com)
 //            Mirko Raus (mirko.raus@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-#include "src/dsp/mips_macro.h"
-#include "src/enc/cost_enc.h"
-#include "src/enc/vp8i_enc.h"
+#include "./mips_macro.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"

 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
@@ -141,8 +141,7 @@ static const int kC2 = 35468;
  "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"

-static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
-                                 int16_t* out) {
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  const int c2217 = 2217;
  const int c5352 = 5352;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@@ -239,16 +238,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  );
 }

-static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
-                                 uint8_t* dst, int do_two) {
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

-static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
-                              const uint16_t* const w) {
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;

@@ -314,14 +313,13 @@ static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
  return abs(temp3 - temp17) >> 5;
 }

-static int Disto16x16_MIPSdspR2(const uint8_t* const a,
-                                const uint8_t* const b,
-                                const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
+      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
@@ -1013,8 +1011,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

-static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
-                                       const uint8_t* top) {
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+                             const uint8_t* top) {
  // U block
  DCMode8(C8DC8 + dst, left, top);
  VerticalPred8(C8VE8 + dst, top);
@@ -1033,8 +1031,8 @@ static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

-static void Intra16Preds_MIPSdspR2(uint8_t* dst,
-                                   const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds(uint8_t* dst,
+                         const uint8_t* left, const uint8_t* top) {
  DCMode16(I16DC16 + dst, left, top);
  VerticalPred16(I16VE16 + dst, top);
  HorizontalPred16(I16HE16 + dst, left);
@@ -1043,7 +1041,7 @@ static void Intra16Preds_MIPSdspR2(uint8_t* dst,

 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@@ -1079,7 +1077,7 @@ static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
  GET_SSE_INNER(C)                        \
  GET_SSE_INNER(D)

-static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@@ -1109,7 +1107,7 @@ static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@@ -1131,7 +1129,7 @@ static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@@ -1149,7 +1147,7 @@ static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@@ -1272,8 +1270,8 @@ static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  "usw         $0,           " #J "(%[ppin])                 \n\t"        \
 "3:                                                          \n\t"

-static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
-                                   const VP8Matrix* const mtx) {
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
  int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
  int sign, coeff, level;
  int max_level = MAX_LEVEL;
@@ -1313,11 +1311,11 @@ static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
  return (ret != 0);
 }

-static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
-                                     const VP8Matrix* const mtx) {
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
  int nz;
-  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
 }

@@ -1360,7 +1358,7 @@ static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
  "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
  "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"

-static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
+static void FTransformWHT(const int16_t* in, int16_t* out) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;

@@ -1452,9 +1450,9 @@ static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp3])                   \n\t"

-static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
-                                       int start_block, int end_block,
-                                       VP8Histogram* const histo) {
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
@@ -1486,28 +1484,23 @@ static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
 extern void VP8EncDspInitMIPSdspR2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
-  VP8FTransform = FTransform_MIPSdspR2;
-  VP8FTransformWHT = FTransformWHT_MIPSdspR2;
-  VP8ITransform = ITransform_MIPSdspR2;
-
-  VP8TDisto4x4 = Disto4x4_MIPSdspR2;
-  VP8TDisto16x16 = Disto16x16_MIPSdspR2;
-
-  VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
-  VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
-  VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
-
+  VP8FTransform = FTransform;
+  VP8ITransform = ITransform;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  VP8EncPredLuma16 = Intra16Preds;
+  VP8EncPredChroma8 = IntraChromaPreds;
+  VP8EncPredLuma4 = Intra4Preds;
 #if !defined(WORK_AROUND_GCC)
-  VP8SSE16x16 = SSE16x16_MIPSdspR2;
-  VP8SSE8x8 = SSE8x8_MIPSdspR2;
-  VP8SSE16x8 = SSE16x8_MIPSdspR2;
-  VP8SSE4x4 = SSE4x4_MIPSdspR2;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE4x4 = SSE4x4;
 #endif
-
-  VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
-  VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
-
-  VP8CollectHistogram = CollectHistogram_MIPSdspR2;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8FTransformWHT = FTransformWHT;
+  VP8CollectHistogram = CollectHistogram;
 }

 #else  // !WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/enc_msa.c
+++ b/src/dsp/enc_msa.c
@@ -11,13 +11,13 @@
 //
 // Author:  Prashant Patil   (prashant.patil@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MSA)

 #include <stdlib.h>
-#include "src/dsp/msa_macro.h"
-#include "src/enc/vp8i_enc.h"
+#include "./msa_macro.h"
+#include "../enc/vp8i_enc.h"

 //------------------------------------------------------------------------------
 // Transforms
@@ -69,16 +69,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }

-static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                           int do_two) {
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

-static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
-                           int16_t* out) {
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
@@ -131,7 +130,7 @@ static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
  SD4(out0, out1, out2, out3, out, 8);
 }

-static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
+static void FTransformWHT(const int16_t* in, int16_t* out) {
  v8i16 in0 = { 0 };
  v8i16 in1 = { 0 };
  v8i16 tmp0, tmp1, tmp2, tmp3;
@@ -168,7 +167,7 @@ static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
  ST_SH2(out0, out1, out, 8);
 }

-static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
+static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum;
  uint32_t in0_m, in1_m, in2_m, in3_m;
  v16i8 src0 = { 0 };
@@ -200,20 +199,20 @@ static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
  return sum;
 }

-static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
-                        const uint16_t* const w) {
-  const int sum1 = TTransform_MSA(a, w);
-  const int sum2 = TTransform_MSA(b, w);
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int sum1 = TTransform(a, w);
+  const int sum2 = TTransform(b, w);
  return abs(sum2 - sum1) >> 5;
 }

-static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
-                          const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4_MSA(a + x + y, b + x + y, w);
+      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
@@ -222,9 +221,9 @@ static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
 //------------------------------------------------------------------------------
 // Histogram

-static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
-                                 int start_block, int end_block,
-                                 VP8Histogram* const histo) {
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
@@ -431,7 +430,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 #undef AVG3
 #undef AVG2

-static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@@ -548,8 +547,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
  STORE16x16(out, dst);
 }

-static void Intra16Preds_MSA(uint8_t* dst,
-                             const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds(uint8_t* dst,
+                         const uint8_t* left, const uint8_t* top) {
  DCMode16x16(I16DC16 + dst, left, top);
  VerticalPred16x16(I16VE16 + dst, top);
  HorizontalPred16x16(I16HE16 + dst, left);
@@ -670,8 +669,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
  STORE8x8(out, dst);
 }

-static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
-                                 const uint8_t* top) {
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+                             const uint8_t* top) {
  // U block
  DCMode8x8(C8DC8 + dst, left, top);
  VerticalPred8x8(C8VE8 + dst, top);
@@ -712,7 +711,7 @@ static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
 } while (0)

-static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -739,7 +738,7 @@ static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
  return sum;
 }

-static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -758,7 +757,7 @@ static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
  return sum;
 }

-static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -778,7 +777,7 @@ static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
  return sum;
 }

-static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  uint32_t sum = 0;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
@@ -800,8 +799,8 @@ static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
 //------------------------------------------------------------------------------
 // Quantization

-static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
-                             const VP8Matrix* const mtx) {
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
  int sum;
  v8i16 in0, in1, sh0, sh1, out0, out1;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@@ -853,8 +852,8 @@ static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
  return (sum > 0);
 }

-static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
-                               const VP8Matrix* const mtx) {
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@@ -867,26 +866,26 @@ static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitMSA(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
-  VP8ITransform = ITransform_MSA;
-  VP8FTransform = FTransform_MSA;
-  VP8FTransformWHT = FTransformWHT_MSA;
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+  VP8FTransformWHT = FTransformWHT;

-  VP8TDisto4x4 = Disto4x4_MSA;
-  VP8TDisto16x16 = Disto16x16_MSA;
-  VP8CollectHistogram = CollectHistogram_MSA;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  VP8CollectHistogram = CollectHistogram;

-  VP8EncPredLuma4 = Intra4Preds_MSA;
-  VP8EncPredLuma16 = Intra16Preds_MSA;
-  VP8EncPredChroma8 = IntraChromaPreds_MSA;
+  VP8EncPredLuma4 = Intra4Preds;
+  VP8EncPredLuma16 = Intra16Preds;
+  VP8EncPredChroma8 = IntraChromaPreds;

-  VP8SSE16x16 = SSE16x16_MSA;
-  VP8SSE16x8 = SSE16x8_MSA;
-  VP8SSE8x8 = SSE8x8_MSA;
-  VP8SSE4x4 = SSE4x4_MSA;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE4x4 = SSE4x4;

-  VP8EncQuantizeBlock = QuantizeBlock_MSA;
-  VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
-  VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8EncQuantizeBlockWHT = QuantizeBlock;
 }

 #else  // !WEBP_USE_MSA
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@@ -11,14 +11,14 @@
 //
 // adapted from libvpx (http://www.webmproject.org/code/)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_NEON)

 #include <assert.h>

-#include "src/dsp/neon.h"
-#include "src/enc/vp8i_enc.h"
+#include "./neon.h"
+#include "../enc/vp8i_enc.h"

 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@@ -37,15 +37,15 @@ static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
 #if defined(WEBP_USE_INTRINSICS)

 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
+static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
 }

 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
 // to the corresponding rows of 'dst'.
-static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
-                                                 const int16x8_t dst01,
-                                                 const int16x8_t dst23) {
+static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
+                                            const int16x8_t dst01,
+                                            const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@@ -57,10 +57,8 @@ static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
 }

-static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
-                                    const int16x8_t row23,
-                                    const uint8_t* const ref,
-                                    uint8_t* const dst) {
+static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
+                               const uint8_t* const ref, uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

@@ -72,20 +70,19 @@ static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,

  {
    // Convert to 16b.
-    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
-    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
+    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
+    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
-    SaturateAndStore4x4_NEON(dst, out01, out23);
+    SaturateAndStore4x4(dst, out01, out23);
  }
 }

-static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
-                                          const int16x8_t in1,
-                                          int16x8x2_t* const out) {
+static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
+                                     int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
@@ -93,7 +90,7 @@ static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
 }

-static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
+static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
@@ -116,22 +113,22 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
-  Transpose8x2_NEON(E0, E1, rows);
+  Transpose8x2(E0, E1, rows);
 }

-static void ITransformOne_NEON(const uint8_t* ref,
-                               const int16_t* in, uint8_t* dst) {
+static void ITransformOne(const uint8_t* ref,
+                          const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
-  TransformPass_NEON(&rows);
-  TransformPass_NEON(&rows);
-  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
+  TransformPass(&rows);
+  TransformPass(&rows);
+  Add4x4(rows.val[0], rows.val[1], ref, dst);
 }

 #else

-static void ITransformOne_NEON(const uint8_t* ref,
-                               const int16_t* in, uint8_t* dst) {
+static void ITransformOne(const uint8_t* ref,
+                          const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

@@ -246,16 +243,16 @@ static void ITransformOne_NEON(const uint8_t* ref,

 #endif    // WEBP_USE_INTRINSICS

-static void ITransform_NEON(const uint8_t* ref,
-                            const int16_t* in, uint8_t* dst, int do_two) {
-  ITransformOne_NEON(ref, in, dst);
+static void ITransform(const uint8_t* ref,
+                       const int16_t* in, uint8_t* dst, int do_two) {
+  ITransformOne(ref, in, dst);
  if (do_two) {
-    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
+    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

 // Load all 4x4 pixels into a single uint8x16_t variable.
-static uint8x16_t Load4x4_NEON(const uint8_t* src) {
+static uint8x16_t Load4x4(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
@@ -268,12 +265,10 @@ static uint8x16_t Load4x4_NEON(const uint8_t* src) {

 #if defined(WEBP_USE_INTRINSICS)

-static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
-                                              const int16x4_t B,
-                                              const int16x4_t C,
-                                              const int16x4_t D,
-                                              int16x8_t* const out01,
-                                              int16x8_t* const out32) {
+static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
+                                         const int16x4_t C, const int16x4_t D,
+                                         int16x8_t* const out01,
+                                         int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
@@ -288,24 +283,24 @@ static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                   vreinterpret_s64_s32(tmp02.val[1])));
 }

-static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
-                                              const uint8x8_t b) {
+static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
+                                         const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
 }

-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform(const uint8_t* src, const uint8_t* ref,
+                       int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
-    const uint8x16_t S0 = Load4x4_NEON(src);
-    const uint8x16_t R0 = Load4x4_NEON(ref);
-    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
-    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
+    const uint8x16_t S0 = Load4x4(src);
+    const uint8x16_t R0 = Load4x4(ref);
+    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
+    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
-    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
+    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // 1rst pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
@@ -323,7 +318,7 @@ static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
-    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // 2nd pass
    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
@@ -363,8 +358,8 @@ static const int32_t kCoeff32[] = {
  51000, 51000, 51000, 51000
 };

-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform(const uint8_t* src, const uint8_t* ref,
+                       int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
@@ -483,7 +478,7 @@ static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
  src += stride;                                    \
 } while (0)

-static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
+static void FTransformWHT(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
@@ -521,7 +516,7 @@ static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
-    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
+    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
@@ -565,7 +560,7 @@ static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
 // a 26ae, b 26ae
 // a 37bf, b 37bf
 //
-static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
@@ -579,8 +574,7 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  return q4_in;
 }

-static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
-    const int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
@@ -599,7 +593,7 @@ static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
  return q4_out;
 }

-static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
@@ -616,7 +610,7 @@ static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  return q4_out;
 }

-static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
+static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
@@ -628,8 +622,8 @@ static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  return d4_w;
 }

-static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
-                                           const int16x4x4_t d4_w) {
+static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
+                                      const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
@@ -658,8 +652,8 @@ static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
-                         const uint16_t* const w) {
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@@ -685,12 +679,12 @@ static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
-    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
-    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
+    const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
+    const int16x4x4_t d4_w = DistoLoadW(w);
    // horizontal pass
-    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
-    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
-    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
+    const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
+    const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
+    int32x2_t d_sum = DistoSum(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
@@ -700,13 +694,13 @@ static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
 }
 #undef LOAD_LANE_32b

-static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4_NEON(a + x + y, b + x + y, w);
+      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
@@ -714,15 +708,15 @@ static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,

 //------------------------------------------------------------------------------

-static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
-                                  int start_block, int end_block,
-                                  VP8Histogram* const histo) {
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
-    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
@@ -746,9 +740,9 @@ static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,

 //------------------------------------------------------------------------------

-static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
-                                             const uint8_t* const b,
-                                             uint32x4_t* const sum) {
+static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
+                                        const uint8_t* const b,
+                                        uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@@ -763,7 +757,7 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
 }

 // Horizontal sum of all four uint32_t values in 'sum'.
-static int SumToInt_NEON(uint32x4_t sum) {
+static int SumToInt(uint32x4_t sum) {
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
  return (int)sum3;
@@ -773,18 +767,18 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
-    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
+    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
  }
-  return SumToInt_NEON(sum);
+  return SumToInt(sum);
 }

 static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
-    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
+    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
  }
-  return SumToInt_NEON(sum);
+  return SumToInt(sum);
 }

 static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
@@ -797,12 +791,12 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
-  return SumToInt_NEON(sum);
+  return SumToInt(sum);
 }

 static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
-  const uint8x16_t a0 = Load4x4_NEON(a);
-  const uint8x16_t b0 = Load4x4_NEON(b);
+  const uint8x16_t a0 = Load4x4(a);
+  const uint8x16_t b0 = Load4x4(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
@@ -811,7 +805,7 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
-  return SumToInt_NEON(vaddq_u32(sum1, sum2));
+  return SumToInt(vaddq_u32(sum1, sum2));
 }

 //------------------------------------------------------------------------------
@@ -819,8 +813,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
 // Compilation with gcc-4.6.x is problematic for now.
 #if !defined(WORK_AROUND_GCC)

-static int16x8_t Quantize_NEON(int16_t* const in,
-                               const VP8Matrix* const mtx, int offset) {
+static int16x8_t Quantize(int16_t* const in,
+                          const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@@ -853,10 +847,10 @@ static const uint8_t kShuffles[4][8] = {
  { 14, 15, 22, 23, 28, 29, 30, 31 }
 };

-static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
-                              const VP8Matrix* const mtx) {
-  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
-  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
+  const int16x8_t out0 = Quantize(in, mtx, 0);
+  const int16x8_t out1 = Quantize(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
@@ -895,11 +889,11 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
  return 0;
 }

-static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
-                                const VP8Matrix* const mtx) {
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
  int nz;
-  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
 }

@@ -911,14 +905,14 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitNEON(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
-  VP8ITransform = ITransform_NEON;
-  VP8FTransform = FTransform_NEON;
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;

-  VP8FTransformWHT = FTransformWHT_NEON;
+  VP8FTransformWHT = FTransformWHT;

-  VP8TDisto4x4 = Disto4x4_NEON;
-  VP8TDisto16x16 = Disto16x16_NEON;
-  VP8CollectHistogram = CollectHistogram_NEON;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  VP8CollectHistogram = CollectHistogram;

  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
@@ -926,8 +920,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8SSE4x4 = SSE4x4_NEON;

 #if !defined(WORK_AROUND_GCC)
-  VP8EncQuantizeBlock = QuantizeBlock_NEON;
-  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
 #endif
 }

--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -11,23 +11,23 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_SSE2)
 #include <assert.h>
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>

-#include "src/dsp/common_sse2.h"
-#include "src/enc/cost_enc.h"
-#include "src/enc/vp8i_enc.h"
+#include "./common_sse2.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"

 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

 // Does one or two inverse transforms.
-static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                            int do_two) {
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -193,10 +193,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
  }
 }

-static void FTransformPass1_SSE2(const __m128i* const in01,
-                                 const __m128i* const in23,
-                                 __m128i* const out01,
-                                 __m128i* const out32) {
+static void FTransformPass1(const __m128i* const in01,
+                            const __m128i* const in23,
+                            __m128i* const out01,
+                            __m128i* const out32) {
  const __m128i k937 = _mm_set1_epi32(937);
  const __m128i k1812 = _mm_set1_epi32(1812);

@@ -239,9 +239,8 @@ static void FTransformPass1_SSE2(const __m128i* const in01,
  *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
 }

-static void FTransformPass2_SSE2(const __m128i* const v01,
-                                 const __m128i* const v32,
-                                 int16_t* out) {
+static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
+                            int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
@@ -292,8 +291,7 @@ static void FTransformPass2_SSE2(const __m128i* const v01,
  _mm_storeu_si128((__m128i*)&out[8], d2_f3);
 }

-static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  // Load src.
  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@@ -330,14 +328,13 @@ static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
  __m128i v01, v32;

  // First pass
-  FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
+  FTransformPass1(&row01, &row23, &v01, &v32);

  // Second pass
-  FTransformPass2_SSE2(&v01, &v32, out);
+  FTransformPass2(&v01, &v32, out);
 }

-static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
-                             int16_t* out) {
+static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  const __m128i zero = _mm_setzero_si128();

  // Load src and convert to 16b.
@@ -377,15 +374,15 @@ static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
  __m128i v01h, v32h;

  // First pass
-  FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
-  FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
+  FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
+  FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);

  // Second pass
-  FTransformPass2_SSE2(&v01l, &v32l, out + 0);
-  FTransformPass2_SSE2(&v01h, &v32h, out + 16);
+  FTransformPass2(&v01l, &v32l, out + 0);
+  FTransformPass2(&v01h, &v32h, out + 16);
 }

-static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
+static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
  const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
  const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
  const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@@ -401,14 +398,14 @@ static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
  *out = _mm_madd_epi16(D, kMult);
 }

-static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
+static void FTransformWHT(const int16_t* in, int16_t* out) {
  // Input is 12b signed.
  __m128i row0, row1, row2, row3;
  // Rows are 14b signed.
-  FTransformWHTRow_SSE2(in + 0 * 64, &row0);
-  FTransformWHTRow_SSE2(in + 1 * 64, &row1);
-  FTransformWHTRow_SSE2(in + 2 * 64, &row2);
-  FTransformWHTRow_SSE2(in + 3 * 64, &row3);
+  FTransformWHTRow(in + 0 * 64, &row0);
+  FTransformWHTRow(in + 1 * 64, &row1);
+  FTransformWHTRow(in + 2 * 64, &row2);
+  FTransformWHTRow(in + 3 * 64, &row3);

  {
    // The a* are 15b signed.
@@ -434,9 +431,9 @@ static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.

-static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
-                                  int start_block, int end_block,
-                                  VP8Histogram* const histo) {
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
@@ -445,7 +442,7 @@ static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
    int16_t out[16];
    int k;

-    FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
@@ -479,7 +476,7 @@ static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
 // Intra predictions

 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 8; ++j) {
@@ -487,7 +484,7 @@ static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
  }
 }

-static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 16; ++j) {
@@ -495,20 +492,20 @@ static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
  }
 }

-static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  if (size == 4) {
    int j;
    for (j = 0; j < 4; ++j) {
      memset(dst + j * BPS, value, 4);
    }
  } else if (size == 8) {
-    Put8x8uv_SSE2(value, dst);
+    Put8x8uv(value, dst);
  } else {
-    Put16_SSE2(value, dst);
+    Put16(value, dst);
  }
 }

-static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
  int j;
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  for (j = 0; j < 8; ++j) {
@@ -516,7 +513,7 @@ static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
  }
 }

-static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
  const __m128i top_values = _mm_load_si128((const __m128i*)top);
  int j;
  for (j = 0; j < 16; ++j) {
@@ -524,20 +521,20 @@ static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
  }
 }

-static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
-                                          const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+                                     const uint8_t* top, int size) {
  if (top != NULL) {
    if (size == 8) {
-      VE8uv_SSE2(dst, top);
+      VE8uv(dst, top);
    } else {
-      VE16_SSE2(dst, top);
+      VE16(dst, top);
    }
  } else {
-    Fill_SSE2(dst, 127, size);
+    Fill(dst, 127, size);
  }
 }

-static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
  int j;
  for (j = 0; j < 8; ++j) {
    const __m128i values = _mm_set1_epi8(left[j]);
@@ -546,7 +543,7 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
  }
 }

-static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
  int j;
  for (j = 0; j < 16; ++j) {
    const __m128i values = _mm_set1_epi8(left[j]);
@@ -555,21 +552,21 @@ static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
  }
 }

-static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
-                                            const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+                                       const uint8_t* left, int size) {
  if (left != NULL) {
    if (size == 8) {
-      HE8uv_SSE2(dst, left);
+      HE8uv(dst, left);
    } else {
-      HE16_SSE2(dst, left);
+      HE16(dst, left);
    }
  } else {
-    Fill_SSE2(dst, 129, size);
+    Fill(dst, 129, size);
  }
 }

-static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
-                                const uint8_t* top, int size) {
+static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
+                           const uint8_t* top, int size) {
  const __m128i zero = _mm_setzero_si128();
  int y;
  if (size == 8) {
@@ -596,13 +593,13 @@ static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
  }
 }

-static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
-                                        const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top, int size) {
  if (left != NULL) {
    if (top != NULL) {
-      TM_SSE2(dst, left, top, size);
+      TM(dst, left, top, size);
    } else {
-      HorizontalPred_SSE2(dst, left, size);
+      HorizontalPred(dst, left, size);
    }
  } else {
    // true motion without left samples (hence: with default 129 value)
@@ -610,90 +607,90 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
    // Note that if top samples are not available, the default value is
    // then 129, and not 127 as in the VerticalPred case.
    if (top != NULL) {
-      VerticalPred_SSE2(dst, top, size);
+      VerticalPred(dst, top, size);
    } else {
-      Fill_SSE2(dst, 129, size);
+      Fill(dst, 129, size);
    }
  }
 }

-static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
-                                   const uint8_t* top) {
+static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
+                              const uint8_t* top) {
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
  const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
  const int DC = VP8HorizontalAdd8b(&combined) + 8;
-  Put8x8uv_SSE2(DC >> 4, dst);
+  Put8x8uv(DC >> 4, dst);
 }

-static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  const __m128i sum = _mm_sad_epu8(top_values, zero);
  const int DC = _mm_cvtsi128_si32(sum) + 4;
-  Put8x8uv_SSE2(DC >> 3, dst);
+  Put8x8uv(DC >> 3, dst);
 }

-static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
  // 'left' is contiguous so we can reuse the top summation.
-  DC8uvNoLeft_SSE2(dst, left);
+  DC8uvNoLeft(dst, left);
 }

-static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
-  Put8x8uv_SSE2(0x80, dst);
+static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
+  Put8x8uv(0x80, dst);
 }

-static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
-                                       const uint8_t* top) {
+static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
  if (top != NULL) {
    if (left != NULL) {  // top and left present
-      DC8uv_SSE2(dst, left, top);
+      DC8uv(dst, left, top);
    } else {  // top, but no left
-      DC8uvNoLeft_SSE2(dst, top);
+      DC8uvNoLeft(dst, top);
    }
  } else if (left != NULL) {  // left but no top
-    DC8uvNoTop_SSE2(dst, left);
+    DC8uvNoTop(dst, left);
  } else {  // no top, no left, nothing.
-    DC8uvNoTopLeft_SSE2(dst);
+    DC8uvNoTopLeft(dst);
  }
 }

-static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
+                             const uint8_t* top) {
  const __m128i top_row = _mm_load_si128((const __m128i*)top);
  const __m128i left_row = _mm_load_si128((const __m128i*)left);
  const int DC =
      VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
-  Put16_SSE2(DC >> 5, dst);
+  Put16(DC >> 5, dst);
 }

-static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
  const __m128i top_row = _mm_load_si128((const __m128i*)top);
  const int DC = VP8HorizontalAdd8b(&top_row) + 8;
-  Put16_SSE2(DC >> 4, dst);
+  Put16(DC >> 4, dst);
 }

-static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
  // 'left' is contiguous so we can reuse the top summation.
-  DC16NoLeft_SSE2(dst, left);
+  DC16NoLeft(dst, left);
 }

-static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
-  Put16_SSE2(0x80, dst);
+static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
+  Put16(0x80, dst);
 }

-static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
-                                      const uint8_t* top) {
+static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
+                                 const uint8_t* top) {
  if (top != NULL) {
    if (left != NULL) {  // top and left present
-      DC16_SSE2(dst, left, top);
+      DC16(dst, left, top);
    } else {  // top, but no left
-      DC16NoLeft_SSE2(dst, top);
+      DC16NoLeft(dst, top);
    }
  } else if (left != NULL) {  // left but no top
-    DC16NoTop_SSE2(dst, left);
+    DC16NoTop(dst, left);
  } else {  // no top, no left, nothing.
-    DC16NoTopLeft_SSE2(dst);
+    DC16NoTopLeft(dst);
  }
 }

@@ -712,8 +709,7 @@ static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1

-static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // vertical
+static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -729,8 +725,7 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
  }
 }

-static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // horizontal
+static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@@ -742,15 +737,14 @@ static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }

-static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
-  Fill_SSE2(dst, dc >> 3, 4);
+  Fill(dst, dc >> 3, 4);
 }

-static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Down-Left
+static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {  // Down-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -766,8 +760,8 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }

-static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Vertical-Right
+static WEBP_INLINE void VR4(uint8_t* dst,
+                            const uint8_t* top) {  // Vertical-Right
  const __m128i one = _mm_set1_epi8(1);
  const int I = top[-2];
  const int J = top[-3];
@@ -792,8 +786,8 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
  DST(0, 3) = AVG3(K, J, I);
 }

-static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Vertical-Left
+static WEBP_INLINE void VL4(uint8_t* dst,
+                            const uint8_t* top) {  // Vertical-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@@ -818,8 +812,7 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
  DST(3, 3) = (extra_out >> 8) & 0xff;
 }

-static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Down-right
+static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {  // Down-right
  const __m128i one = _mm_set1_epi8(1);
  const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
  const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@@ -835,7 +828,7 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }

-static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
@@ -850,7 +843,7 @@ static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@@ -873,7 +866,7 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
  DST(1, 3)             = AVG3(L, K, J);
 }

-static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
  const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@@ -895,56 +888,55 @@ static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {

 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
-  DC4_SSE2(I4DC4 + dst, top);
-  TM4_SSE2(I4TM4 + dst, top);
-  VE4_SSE2(I4VE4 + dst, top);
-  HE4_SSE2(I4HE4 + dst, top);
-  RD4_SSE2(I4RD4 + dst, top);
-  VR4_SSE2(I4VR4 + dst, top);
-  LD4_SSE2(I4LD4 + dst, top);
-  VL4_SSE2(I4VL4 + dst, top);
-  HD4_SSE2(I4HD4 + dst, top);
-  HU4_SSE2(I4HU4 + dst, top);
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+  DC4(I4DC4 + dst, top);
+  TM4(I4TM4 + dst, top);
+  VE4(I4VE4 + dst, top);
+  HE4(I4HE4 + dst, top);
+  RD4(I4RD4 + dst, top);
+  VR4(I4VR4 + dst, top);
+  LD4(I4LD4 + dst, top);
+  VL4(I4VL4 + dst, top);
+  HD4(I4HD4 + dst, top);
+  HU4(I4HU4 + dst, top);
 }

 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

-static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+                             const uint8_t* top) {
  // U block
-  DC8uvMode_SSE2(C8DC8 + dst, left, top);
-  VerticalPred_SSE2(C8VE8 + dst, top, 8);
-  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
-  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
+  DC8uvMode(C8DC8 + dst, left, top);
+  VerticalPred(C8VE8 + dst, top, 8);
+  HorizontalPred(C8HE8 + dst, left, 8);
+  TrueMotion(C8TM8 + dst, left, top, 8);
  // V block
  dst += 8;
  if (top != NULL) top += 8;
  if (left != NULL) left += 16;
-  DC8uvMode_SSE2(C8DC8 + dst, left, top);
-  VerticalPred_SSE2(C8VE8 + dst, top, 8);
-  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
-  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
+  DC8uvMode(C8DC8 + dst, left, top);
+  VerticalPred(C8VE8 + dst, top, 8);
+  HorizontalPred(C8HE8 + dst, left, 8);
+  TrueMotion(C8TM8 + dst, left, top, 8);
 }

 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

-static void Intra16Preds_SSE2(uint8_t* dst,
-                              const uint8_t* left, const uint8_t* top) {
-  DC16Mode_SSE2(I16DC16 + dst, left, top);
-  VerticalPred_SSE2(I16VE16 + dst, top, 16);
-  HorizontalPred_SSE2(I16HE16 + dst, left, 16);
-  TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
+static void Intra16Preds(uint8_t* dst,
+                         const uint8_t* left, const uint8_t* top) {
+  DC16Mode(I16DC16 + dst, left, top);
+  VerticalPred(I16VE16 + dst, top, 16);
+  HorizontalPred(I16HE16 + dst, left, 16);
+  TrueMotion(I16TM16 + dst, left, top, 16);
 }

 //------------------------------------------------------------------------------
 // Metric

-static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
-                                                   const __m128i b,
-                                                   __m128i* const sum) {
+static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
+                                              __m128i* const sum) {
  // take abs(a-b) in 8b
  const __m128i a_b = _mm_subs_epu8(a, b);
  const __m128i b_a = _mm_subs_epu8(b, a);
@@ -959,8 +951,8 @@ static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
  *sum = _mm_add_epi32(sum1, sum2);
 }

-static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
-                                     int num_pairs) {
+static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
+                                int num_pairs) {
  __m128i sum = _mm_setzero_si128();
  int32_t tmp[4];
  int i;
@@ -971,8 +963,8 @@ static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
    __m128i sum1, sum2;
-    SubtractAndAccumulate_SSE2(a0, b0, &sum1);
-    SubtractAndAccumulate_SSE2(a1, b1, &sum2);
+    SubtractAndAccumulate(a0, b0, &sum1);
+    SubtractAndAccumulate(a1, b1, &sum2);
    sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
    a += 2 * BPS;
    b += 2 * BPS;
@@ -981,18 +973,18 @@ static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }

-static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
-  return SSE_16xN_SSE2(a, b, 8);
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN(a, b, 8);
 }

-static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
-  return SSE_16xN_SSE2(a, b, 4);
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN(a, b, 4);
 }

 #define LOAD_8x16b(ptr) \
  _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)

-static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  int num_pairs = 4;
  __m128i sum = zero;
@@ -1019,7 +1011,7 @@ static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
 }
 #undef LOAD_8x16b

-static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
@@ -1056,7 +1048,7 @@ static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {

 //------------------------------------------------------------------------------

-static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
  const __m128i mask = _mm_set1_epi16(0x00ff);
  const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
  const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@@ -1094,8 +1086,8 @@ static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
-                           const uint16_t* const w) {
+static int TTransform(const uint8_t* inA, const uint8_t* inB,
+                      const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
@@ -1195,19 +1187,19 @@ static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
  return sum[0] + sum[1] + sum[2] + sum[3];
 }

-static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
-                         const uint16_t* const w) {
-  const int diff_sum = TTransform_SSE2(a, b, w);
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int diff_sum = TTransform(a, b, w);
  return abs(diff_sum) >> 5;
 }

-static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4_SSE2(a + x + y, b + x + y, w);
+      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
@@ -1217,9 +1209,9 @@ static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 //

-static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
-                                            const uint16_t* const sharpen,
-                                            const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
+                                       const uint16_t* const sharpen,
+                                       const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
@@ -1329,22 +1321,22 @@ static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
 }

-static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
-                              const VP8Matrix* const mtx) {
-  return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
 }

-static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
-                                 const VP8Matrix* const mtx) {
-  return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+                            const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, NULL, mtx);
 }

-static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
-                                const VP8Matrix* const mtx) {
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
  int nz;
  const uint16_t* const sharpen = &mtx->sharpen_[0];
-  nz  = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
-  nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
  return nz;
 }

@@ -1354,24 +1346,24 @@ static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitSSE2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
-  VP8CollectHistogram = CollectHistogram_SSE2;
-  VP8EncPredLuma16 = Intra16Preds_SSE2;
-  VP8EncPredChroma8 = IntraChromaPreds_SSE2;
-  VP8EncPredLuma4 = Intra4Preds_SSE2;
-  VP8EncQuantizeBlock = QuantizeBlock_SSE2;
-  VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
-  VP8ITransform = ITransform_SSE2;
-  VP8FTransform = FTransform_SSE2;
-  VP8FTransform2 = FTransform2_SSE2;
-  VP8FTransformWHT = FTransformWHT_SSE2;
-  VP8SSE16x16 = SSE16x16_SSE2;
-  VP8SSE16x8 = SSE16x8_SSE2;
-  VP8SSE8x8 = SSE8x8_SSE2;
-  VP8SSE4x4 = SSE4x4_SSE2;
-  VP8TDisto4x4 = Disto4x4_SSE2;
-  VP8TDisto16x16 = Disto16x16_SSE2;
-  VP8Mean16x4 = Mean16x4_SSE2;
+  VP8CollectHistogram = CollectHistogram;
+  VP8EncPredLuma16 = Intra16Preds;
+  VP8EncPredChroma8 = IntraChromaPreds;
+  VP8EncPredLuma4 = Intra4Preds;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+  VP8FTransform2 = FTransform2;
+  VP8FTransformWHT = FTransformWHT;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE4x4 = SSE4x4;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  VP8Mean16x4 = Mean16x4;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/enc_sse41.c
+++ b/src/dsp/enc_sse41.c
@@ -11,21 +11,21 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_SSE41)
 #include <smmintrin.h>
 #include <stdlib.h>  // for abs()

-#include "src/dsp/common_sse2.h"
-#include "src/enc/vp8i_enc.h"
+#include "./common_sse2.h"
+#include "../enc/vp8i_enc.h"

 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms.

-static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
-                                   int start_block, int end_block,
-                                   VP8Histogram* const histo) {
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@@ -70,8 +70,8 @@ static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
-                            const uint16_t* const w) {
+static int TTransform(const uint8_t* inA, const uint8_t* inB,
+                      const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;

@@ -168,19 +168,19 @@ static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
  return sum[0] + sum[1] + sum[2] + sum[3];
 }

-static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
-                          const uint16_t* const w) {
-  const int diff_sum = TTransform_SSE41(a, b, w);
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int diff_sum = TTransform(a, b, w);
  return abs(diff_sum) >> 5;
 }

-static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
-                            const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4_SSE41(a + x + y, b + x + y, w);
+      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
@@ -197,9 +197,9 @@ static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
               2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
               2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)

-static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
-                                             const uint16_t* const sharpen,
-                                             const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
+                                       const uint16_t* const sharpen,
+                                       const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i out0, out8;
@@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],

 #undef PSHUFB_CST

-static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
-                               const VP8Matrix* const mtx) {
-  return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
 }

-static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
-                                  const VP8Matrix* const mtx) {
-  return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+                            const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, NULL, mtx);
 }

-static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
-                                 const VP8Matrix* const mtx) {
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
  int nz;
  const uint16_t* const sharpen = &mtx->sharpen_[0];
-  nz  = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
-  nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
  return nz;
 }

@@ -324,12 +324,12 @@ static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],

 extern void VP8EncDspInitSSE41(void);
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
-  VP8CollectHistogram = CollectHistogram_SSE41;
-  VP8EncQuantizeBlock = QuantizeBlock_SSE41;
-  VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
-  VP8TDisto4x4 = Disto4x4_SSE41;
-  VP8TDisto16x16 = Disto16x16_SSE41;
+  VP8CollectHistogram = CollectHistogram;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
 }

 #else  // !WEBP_USE_SSE41
--- a/src/dsp/filters.c
+++ b/src/dsp/filters.c
@@ -11,7 +11,7 @@
 //
 // Author: Urvang (urvang@google.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
@@ -20,17 +20,16 @@
 // Helpful macro.

 # define SANITY_CHECK(in, out)                                                 \
-  assert((in) != NULL);                                                        \
-  assert((out) != NULL);                                                       \
+  assert((in) != NULL);  /* macro args may be expressions: keep parens */     \
+  assert((out) != NULL);                                                       \
  assert(width > 0);                                                           \
  assert(height > 0);                                                          \
  assert(stride >= width);                                                     \
  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
  (void)height;  // Silence unused warning.

-#if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
-                                      uint8_t* dst, int length, int inverse) {
+static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
+                                    uint8_t* dst, int length, int inverse) {
  int i;
  if (inverse) {
    for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
@@ -42,10 +41,10 @@ static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
 //------------------------------------------------------------------------------
 // Horizontal filter.

-static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
-                                             int width, int height, int stride,
-                                             int row, int num_rows,
-                                             int inverse, uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@@ -57,7 +56,7 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
  if (row == 0) {
    // Leftmost pixel is the same as input for topmost scanline.
    out[0] = in[0];
-    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
    row = 1;
    preds += stride;
    in += stride;
@@ -67,8 +66,8 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
  // Filter line-by-line.
  while (row < last_row) {
    // Leftmost pixel is predicted from above.
-    PredictLine_C(in, preds - stride, out, 1, inverse);
-    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine(in, preds - stride, out, 1, inverse);
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
    ++row;
    preds += stride;
    in += stride;
@@ -79,10 +78,10 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.

-static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           int inverse, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@@ -95,7 +94,7 @@ static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
    // Very first top-left pixel is copied.
    out[0] = in[0];
    // Rest of top scan-line is left-predicted.
-    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
    row = 1;
    in += stride;
    out += stride;
@@ -106,28 +105,26 @@ static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,

  // Filter line-by-line.
  while (row < last_row) {
-    PredictLine_C(in, preds, out, width, inverse);
+    PredictLine(in, preds, out, width, inverse);
    ++row;
    preds += stride;
    in += stride;
    out += stride;
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Gradient filter.

-static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
  const int g = a + b - c;
  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }

-#if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           int inverse, uint8_t* out) {
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@@ -139,7 +136,7 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
  // left prediction for top scan-line
  if (row == 0) {
    out[0] = in[0];
-    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
    row = 1;
    preds += stride;
    in += stride;
@@ -150,11 +147,11 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
  while (row < last_row) {
    int w;
    // leftmost pixel: predict from above.
-    PredictLine_C(in, preds - stride, out, 1, inverse);
+    PredictLine(in, preds - stride, out, 1, inverse);
    for (w = 1; w < width; ++w) {
-      const int pred = GradientPredictor_C(preds[w - 1],
-                                           preds[w - stride],
-                                           preds[w - stride - 1]);
+      const int pred = GradientPredictor(preds[w - 1],
+                                         preds[w - stride],
+                                         preds[w - stride - 1]);
      out[w] = in[w] + (inverse ? pred : -pred);
    }
    ++row;
@@ -163,34 +160,32 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
    out += stride;
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

 #undef SANITY_CHECK

 //------------------------------------------------------------------------------

-#if !WEBP_NEON_OMIT_C_CODE
-static void HorizontalFilter_C(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
-                       filtered_data);
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
 }

-static void VerticalFilter_C(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
 }

-static void GradientFilter_C(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE
+

 //------------------------------------------------------------------------------

-static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
-                                 uint8_t* out, int width) {
+static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
+                               uint8_t* out, int width) {
  uint8_t pred = (prev == NULL) ? 0 : prev[0];
  int i;
  for (i = 0; i < width; ++i) {
@@ -199,28 +194,26 @@ static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
  }
 }

-#if !WEBP_NEON_OMIT_C_CODE
-static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
+                             uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter_C(NULL, in, out, width);
+    HorizontalUnfilter(NULL, in, out, width);
  } else {
    int i;
    for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
  }
 }
-#endif  // !WEBP_NEON_OMIT_C_CODE

-static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
+                             uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter_C(NULL, in, out, width);
+    HorizontalUnfilter(NULL, in, out, width);
  } else {
    uint8_t top = prev[0], top_left = top, left = top;
    int i;
    for (i = 0; i < width; ++i) {
      top = prev[i];  // need to read this first, in case prev==out
-      left = in[i] + GradientPredictor_C(left, top, top_left);
+      left = in[i] + GradientPredictor(left, top, top_left);
      top_left = top;
      out[i] = left;
    }
@@ -245,18 +238,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
  if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;

  WebPUnfilters[WEBP_FILTER_NONE] = NULL;
-#if !WEBP_NEON_OMIT_C_CODE
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
-#endif
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;

  WebPFilters[WEBP_FILTER_NONE] = NULL;
-#if !WEBP_NEON_OMIT_C_CODE
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
-#endif
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;

  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -264,6 +253,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
      VP8FiltersInitSSE2();
    }
 #endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8FiltersInitNEON();
+    }
+#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8FiltersInitMIPSdspR2();
@@ -275,20 +269,5 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
    }
 #endif
  }
-
-#if defined(WEBP_USE_NEON)
-  if (WEBP_NEON_OMIT_C_CODE ||
-      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
-    VP8FiltersInitNEON();
-  }
-#endif
-
-  assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
-  assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
-  assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
-  assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
-  assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
-  assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
-
  filters_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/filters_mips_dsp_r2.c
+++ b/src/dsp/filters_mips_dsp_r2.c
@@ -12,11 +12,11 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut (djordje.pesut@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MIPS_DSP_R2)

-#include "src/dsp/dsp.h"
+#include "../dsp/dsp.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
@@ -101,8 +101,8 @@
    );                                                                         \
  } while (0)

-static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
-                                              int length) {
+static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
+                                    int length) {
  DO_PREDICT_LINE(src, dst, length, 0);
 }

@@ -192,11 +192,10 @@ static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
    }                                                                          \
  } while (0)

-static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
-                                                     int width, int height,
-                                                     int stride,
-                                                     int row, int num_rows,
-                                                     uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@@ -208,7 +207,7 @@ static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
  if (row == 0) {
    // Leftmost pixel is the same as input for topmost scanline.
    out[0] = in[0];
-    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+    PredictLine(in + 1, out + 1, width - 1);
    row = 1;
    preds += stride;
    in += stride;
@@ -220,11 +219,9 @@ static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE

-static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
-                                       int width, int height,
-                                       int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
-                               filtered_data);
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
 }

 //------------------------------------------------------------------------------
@@ -240,11 +237,9 @@ static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
    }                                                                          \
  } while (0)

-static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
-                                                   int width, int height,
-                                                   int stride,
-                                                   int row, int num_rows,
-                                                   uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows, uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@@ -257,7 +252,7 @@ static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
    // Very first top-left pixel is copied.
    out[0] = in[0];
    // Rest of top scan-line is left-predicted.
-    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+    PredictLine(in + 1, out + 1, width - 1);
    row = 1;
    in += stride;
    out += stride;
@@ -271,16 +266,15 @@ static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE

-static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
-                                     int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
-                             filtered_data);
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
 }

 //------------------------------------------------------------------------------
 // Gradient filter.

-static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
  int temp0;
  __asm__ volatile (
    "addu             %[temp0],   %[a],       %[b]        \n\t"
@@ -299,9 +293,9 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
      int w;                                                                   \
      PREDICT_LINE_ONE_PASS(in, PREDS - stride, out);                          \
      for (w = 1; w < width; ++w) {                                            \
-        const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1],             \
-                                                     PREDS[w - stride],        \
-                                                     PREDS[w - stride - 1]);   \
+        const int pred = GradientPredictor(PREDS[w - 1],                       \
+                                           PREDS[w - stride],                  \
+                                           PREDS[w - stride - 1]);             \
        out[w] = in[w] OPERATION pred;                                         \
      }                                                                        \
      ++row;                                                                   \
@@ -310,9 +304,9 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
    }                                                                          \
  } while (0)

-static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
-                                       int width, int height, int stride,
-                                       int row, int num_rows, uint8_t* out) {
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows, uint8_t* out) {
  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
@@ -324,7 +318,7 @@ static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
  // left prediction for top scan-line
  if (row == 0) {
    out[0] = in[0];
-    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+    PredictLine(in + 1, out + 1, width - 1);
    row = 1;
    preds += stride;
    in += stride;
@@ -336,39 +330,38 @@ static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE

-static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
-                                     int stride, uint8_t* filtered_data) {
-  DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
-                             filtered_data);
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
 }

 //------------------------------------------------------------------------------

-static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
-                                         uint8_t* out, int width) {
+static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
+                               uint8_t* out, int width) {
 out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
 DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
 }

-static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
-                                       uint8_t* out, int width) {
+static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
+                             uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
+    HorizontalUnfilter(NULL, in, out, width);
  } else {
    DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
  }
 }

-static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
-                                       uint8_t* out, int width) {
+static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
+                             uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
+    HorizontalUnfilter(NULL, in, out, width);
  } else {
    uint8_t top = prev[0], top_left = top, left = top;
    int i;
    for (i = 0; i < width; ++i) {
      top = prev[i];  // need to read this first, in case prev==dst
-      left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
+      left = in[i] + GradientPredictor(left, top, top_left);
      top_left = top;
      out[i] = left;
    }
@@ -386,13 +379,13 @@ static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
 extern void VP8FiltersInitMIPSdspR2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;

-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
 }

 #else  // !WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/filters_msa.c
+++ b/src/dsp/filters_msa.c
@@ -11,11 +11,11 @@
 //
 // Author: Prashant Patil (prashant.patil@imgtec.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_MSA)

-#include "src/dsp/msa_macro.h"
+#include "./msa_macro.h"

 #include <assert.h>

@@ -66,8 +66,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 //------------------------------------------------------------------------------
 // Horrizontal filter

-static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
-                                 int stride, uint8_t* filtered_data) {
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
  const uint8_t* preds = data;
  const uint8_t* in = data;
  uint8_t* out = filtered_data;
@@ -129,8 +129,8 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
 }


-static void GradientFilter_MSA(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
  const uint8_t* in = data;
  const uint8_t* preds = data;
  uint8_t* out = filtered_data;
@@ -157,8 +157,8 @@ static void GradientFilter_MSA(const uint8_t* data, int width, int height,
 //------------------------------------------------------------------------------
 // Vertical filter

-static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
  const uint8_t* in = data;
  const uint8_t* preds = data;
  uint8_t* out = filtered_data;
@@ -190,9 +190,9 @@ static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
 extern void VP8FiltersInitMSA(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
 }

 #else  // !WEBP_USE_MSA
--- a/src/dsp/filters_neon.c
+++ b/src/dsp/filters_neon.c
@@ -11,12 +11,12 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_NEON)

 #include <assert.h>
-#include "src/dsp/neon.h"
+#include "./neon.h"

 //------------------------------------------------------------------------------
 // Helpful macros.
@@ -134,7 +134,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
 }

 static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
+                               int stride, uint8_t* filtered_data) {
  DoVerticalFilter_NEON(data, width, height, stride, 0, height,
                        filtered_data);
 }
@@ -196,7 +196,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
 }

 static void GradientFilter_NEON(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
+                               int stride, uint8_t* filtered_data) {
  DoGradientFilter_NEON(data, width, height, stride, 0, height,
                        filtered_data);
 }
@@ -251,11 +251,9 @@ static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
 // GradientUnfilter_NEON is correct but slower than the C-version,
 // at least on ARM64. For armv7, it's a wash.
 // So best is to disable it for now, but keep the idea around...
-#if !defined(USE_GRADIENT_UNFILTER)
-#define USE_GRADIENT_UNFILTER 0   // ALTERNATE_CODE
-#endif
+// #define USE_GRADIENT_UNFILTER

-#if (USE_GRADIENT_UNFILTER == 1)
+#if defined(USE_GRADIENT_UNFILTER)
 #define GRAD_PROCESS_LANE(L)  do {                                             \
  const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1);  /* rotate predictor in */   \
  const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1));                       \
@@ -294,7 +292,7 @@ static void GradientPredictInverse_NEON(const uint8_t* const in,
 #undef GRAD_PROCESS_LANE

 static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
-                                  uint8_t* out, int width) {
+                                 uint8_t* out, int width) {
  if (prev == NULL) {
    HorizontalUnfilter_NEON(NULL, in, out, width);
  } else {
@@ -313,7 +311,7 @@ extern void VP8FiltersInitNEON(void);
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
-#if (USE_GRADIENT_UNFILTER == 1)
+#if defined(USE_GRADIENT_UNFILTER)
  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
 #endif

--- a/src/dsp/filters_sse2.c
+++ b/src/dsp/filters_sse2.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "src/dsp/dsp.h"
+#include "./dsp.h"

 #if defined(WEBP_USE_SSE2)

@@ -24,16 +24,16 @@
 // Helpful macro.

 # define SANITY_CHECK(in, out)                                                 \
-  assert((in) != NULL);                                                        \
-  assert((out) != NULL);                                                       \
+  assert((in) != NULL);  /* macro args may be expressions: keep parens */     \
+  assert((out) != NULL);                                                       \
  assert(width > 0);                                                           \
  assert(height > 0);                                                          \
  assert(stride >= width);                                                     \
  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
  (void)height;  // Silence unused warning.

-static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
-                                uint8_t* dst, int length) {
+static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
+                           uint8_t* dst, int length) {
  int i;
  const int max_pos = length & ~31;
  assert(length >= 0);
@@ -51,7 +51,7 @@ static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
 }

 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
  int i;
  const int max_pos = length & ~31;
  assert(length >= 0);
@@ -71,11 +71,10 @@ static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
 //------------------------------------------------------------------------------
 // Horizontal filter.

-static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
-                                                int width, int height,
-                                                int stride,
-                                                int row, int num_rows,
-                                                uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           uint8_t* out) {
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
  SANITY_CHECK(in, out);
@@ -85,7 +84,7 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
  if (row == 0) {
    // Leftmost pixel is the same as input for topmost scanline.
    out[0] = in[0];
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+    PredictLineLeft(in + 1, out + 1, width - 1);
    row = 1;
    in += stride;
    out += stride;
@@ -95,7 +94,7 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
  while (row < last_row) {
    // Leftmost pixel is predicted from above.
    out[0] = in[0] - in[-stride];
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+    PredictLineLeft(in + 1, out + 1, width - 1);
    ++row;
    in += stride;
    out += stride;
@@ -105,10 +104,9 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.

-static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
-                                              int width, int height, int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows, uint8_t* out) {
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
  SANITY_CHECK(in, out);
@@ -119,7 +117,7 @@ static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
    // Very first top-left pixel is copied.
    out[0] = in[0];
    // Rest of top scan-line is left-predicted.
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+    PredictLineLeft(in + 1, out + 1, width - 1);
    row = 1;
    in += stride;
    out += stride;
@@ -127,7 +125,7 @@ static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,

  // Filter line-by-line.
  while (row < last_row) {
-    PredictLineTop_SSE2(in, in - stride, out, width);
+    PredictLineTop(in, in - stride, out, width);
    ++row;
    in += stride;
    out += stride;
@@ -137,14 +135,14 @@ static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Gradient filter.

-static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
  const int g = a + b - c;
  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }

-static void GradientPredictDirect_SSE2(const uint8_t* const row,
-                                       const uint8_t* const top,
-                                       uint8_t* const out, int length) {
+static void GradientPredictDirect(const uint8_t* const row,
+                                  const uint8_t* const top,
+                                  uint8_t* const out, int length) {
  const int max_pos = length & ~7;
  int i;
  const __m128i zero = _mm_setzero_si128();
@@ -163,14 +161,14 @@ static void GradientPredictDirect_SSE2(const uint8_t* const row,
    _mm_storel_epi64((__m128i*)(out + i), H);
  }
  for (; i < length; ++i) {
-    out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+    out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
  }
 }

-static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
-                                              int width, int height, int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         uint8_t* out) {
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
  SANITY_CHECK(in, out);
@@ -180,7 +178,7 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
  // left prediction for top scan-line
  if (row == 0) {
    out[0] = in[0];
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+    PredictLineLeft(in + 1, out + 1, width - 1);
    row = 1;
    in += stride;
    out += stride;
@@ -189,7 +187,7 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
  // Filter line-by-line.
  while (row < last_row) {
    out[0] = in[0] - in[-stride];
-    GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
+    GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
    ++row;
    in += stride;
    out += stride;
@@ -200,27 +198,26 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,

 //------------------------------------------------------------------------------

-static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
-                                  int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
-                          filtered_data);
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
 }

-static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
 }

-static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
 }

 //------------------------------------------------------------------------------
 // Inverse transforms

-static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
-                                    uint8_t* out, int width) {
+static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
+                               uint8_t* out, int width) {
  int i;
  __m128i last;
  out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
@@ -241,10 +238,10 @@ static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
  for (; i < width; ++i) out[i] = in[i] + out[i - 1];
 }

-static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
-                                  uint8_t* out, int width) {
+static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
+                             uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter_SSE2(NULL, in, out, width);
+    HorizontalUnfilter(NULL, in, out, width);
  } else {
    int i;
    const int max_pos = width & ~31;
@@ -263,9 +260,9 @@ static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
  }
 }

-static void GradientPredictInverse_SSE2(const uint8_t* const in,
-                                        const uint8_t* const top,
-                                        uint8_t* const row, int length) {
+static void GradientPredictInverse(const uint8_t* const in,
+                                   const uint8_t* const top,
+                                   uint8_t* const row, int length) {
  if (length > 0) {
    int i;
    const int max_pos = length & ~7;
@@ -296,18 +293,18 @@ static void GradientPredictInverse_SSE2(const uint8_t* const in,
      _mm_storel_epi64((__m128i*)&row[i], out);
    }
    for (; i < length; ++i) {
-      row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
    }
  }
 }

-static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
-                                  uint8_t* out, int width) {
+static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
+                             uint8_t* out, int width) {
  if (prev == NULL) {
-    HorizontalUnfilter_SSE2(NULL, in, out, width);
+    HorizontalUnfilter(NULL, in, out, width);
  } else {
    out[0] = in[0] + prev[0];  // predict from above
-    GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
+    GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1);
  }
 }

@@ -317,13 +314,13 @@ static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
 extern void VP8FiltersInitSSE2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;

-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
 }

 #else  // !WEBP_USE_SSE2
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
James Zern	0d7614fddf	README.wasm: add instructions for enabling mulhi Change-Id: I1e9dd737f06ad76f73824352291a6e129ca5ded1	2017-11-02 11:20:09 -07:00
James Zern	88692490a5	WebPMemToUint32: remove ptr cast to int this can result in an alignment hint on arm causing a SIGBUS. casting the input ptr to anything aside from its type is unnecessary for memcpy and is contrary to the intent of this function. Change-Id: I9a4d3f4be90f80cd8c3e96ccbe557e51e34cf7a5 (cherry picked from commit `04b029d236`)	2017-10-31 18:24:54 -07:00
James Zern	0af22e17d6	dec_wasm,NEON mulhi: use local vector types in cast fixes the compile with ENABLE_NEON_BUILTIN_MULHI_INT16X8 without relying on arm_neon.h when using __builtin_neon_vqdmulhq_v; the typedefs are assumed to resolve to the same underlying type. Change-Id: I8840e90d894b5045e0742030cff5e800d7d56efc	2017-10-30 20:40:48 -07:00
James Zern	08af967025	add LOCAL_CLANG_PREREQ and avoid WORK_AROUND_GCC w/3.8+ this results in a 15-20% speedup for lossy decoding on a N5/S6/CM1 BUG=webp:339 Change-Id: Icdeb84c3e0b8908147ac276b4d8f76c3d565b735 (cherry picked from commit `f78da3dea6`)	2017-10-28 11:49:18 -07:00
James Zern	a26996116f	define WEBP_USE_INTRINSICS w/gcc-4.9+ 32-bit builds are neutral to slightly faster using ndk r15c on a N5/S6/CM1 BUG=webp:339 Change-Id: I94b9442e0ceaf2f5edb2b4026bc8b99cd77c918b (cherry picked from commit `01c426f1e7`)	2017-10-28 11:49:18 -07:00
James Zern	5505a5b107	Android.mk,mips: fix clang build with r15 -integrated-as is now required, the opposite of r14 Change-Id: Ic478b2b3b933e66e7d159030eac29f58743eecda (cherry picked from commit `4fbdc9fb12`)	2017-10-28 11:49:18 -07:00
Scott LaVarnway	8ed24a564c	wasm: replace _mulhi_int16x8() with neon builtin BUG=webp:352 Change-Id: I5c21ee4c631fb7eccff36c7d3cd47a72badf1a89	2017-08-23 12:10:37 -07:00
James Zern	0e8c3004be	fix Android standalone toolchain build add a check for cpu-features.h and rework some of the ifdef's around android + neon. for android builds with cpu-features enabled the *_neon.c files will still need to be flagged correctly (with e.g., .c.neon in Android.mk) to properly build them. BUG=webp:353 Change-Id: I905ce305af0a204e560b915d8665093a3edaceb9 (cherry picked from commit `c6d1db4b36`)	2017-08-22 12:23:25 -07:00
Scott LaVarnway	6fcc3a720d	wasm: Add simple loopfilter BUG=webp:352 Change-Id: If51454b8f54fb14ced91fa703e86e057457f454f	2017-07-28 09:04:30 -07:00
Scott LaVarnway	2371724d79	Merge "wasm: replace _mulhi_int16x8() with x86 builtin" into portable-intrinsics	2017-07-28 11:43:05 +00:00
Scott LaVarnway	9d1e151bf5	wasm: replace _mulhi_int16x8() with x86 builtin BUG=webp:352 Change-Id: Ie51a8bb68211d7778610a818bba02e7455e47e23	2017-07-27 15:03:52 -07:00
James Zern	a1ab868427	README.wasm: add android build instructions Change-Id: I819dc373b20b2d0255bf396de5e9b467166bb9c2	2017-07-19 12:48:29 -07:00
Scott LaVarnway	0b8ecc8cfa	wasm: Rename _pack_sw_2_sb to _pack_epi16_to_epi8 BUG=webp:352 Change-Id: I67beb9297d3a37f3691779e0ad6f557f06195219	2017-07-18 13:34:21 -07:00
James Zern	c646241391	README.wasm: normalize cmake command lines + enable NEON for armv7 Change-Id: If914f3973391271a1817e7ff16541f3272b2ad88	2017-07-18 11:09:07 -07:00
James Zern	cfaebe3e95	dec_wasm,armv7: set ENABLE_NEON_BUILTIN_ADDSUB_SAT this path works with -march=armv7a -mfpu=neon with clang Change-Id: I0c04ab9a195c353aa2bfaf3ba2ca8a21e68ee5e1	2017-07-17 22:15:38 -07:00
James Zern	c0eb3ff7d4	dec_wasm,x86: define ENABLE_X86_BUILTIN_ADDSUB_SAT this matches the behavior for aarch64 Change-Id: I243ca6cc5ffec107065a7e0dc442ee69322936ae	2017-07-15 11:54:25 -07:00
Scott LaVarnway	415b98ffad	wasm: Enable neon add,sub with sat builtins For performance testing BUG=webp:352 Change-Id: I3fab48ee610437ac07dd603208972edf17c6f50b	2017-07-14 15:11:57 -07:00
James Zern	09bcd9a397	Merge "wasm: Replace 5 __builtin_shufflevector()" into portable-intrinsics	2017-07-11 22:57:32 +00:00
Scott LaVarnway	e83df9d208	wasm: replace #if with #ifdef BUG=webp:352 Change-Id: Ib173e8cdb9077b385141c0e3a2f7ceb1a183b89e	2017-07-11 13:18:15 -07:00
Scott LaVarnway	3387fb6fa6	wasm: Replace 5 __builtin_shufflevector() with _unpack*() BUG=webp:352 Change-Id: Iea17286260afe7d242c91b02d888c5af59ab0cb7	2017-07-11 13:06:24 -07:00
Scott LaVarnway	599bddb658	Merge "wasm: cleanup _pack_sw_2_sb(), SignedShift8b()" into portable-intrinsics	2017-07-11 19:27:57 +00:00
Scott LaVarnway	28fbe808b9	Merge "wasm: Add VFilter8i and HFilter8i" into portable-intrinsics	2017-07-11 18:53:51 +00:00
Scott LaVarnway	c396e6701b	wasm: cleanup _pack_sw_2_sb(), SignedShift8b() BUG=webp:352 Change-Id: I35a36e1841f71f286fa7e032866a878b52ba56ba	2017-07-11 08:48:49 -07:00
Scott LaVarnway	96ef09107c	Merge "wasm: Generic add,sub saturate" into portable-intrinsics	2017-07-10 22:52:40 +00:00
James Zern	bc01db116f	Merge "add README.wasm" into portable-intrinsics	2017-07-10 22:15:16 +00:00
James Zern	d2adc08095	add README.wasm describes how to build dwebp using portable intrinsics BUG=webp:352 Change-Id: Ibd39dd156d6b64b52f9ade871b969a070bc9ab61	2017-07-10 22:15:00 +00:00
Scott LaVarnway	d6f90a3d83	wasm: Add VFilter8i and HFilter8i BUG=webp:352 Change-Id: I76bac76d55ba2ac62bac479ee60291acacfc491c	2017-07-10 13:19:58 -07:00
Scott LaVarnway	cd01fc3944	wasm: Add VFilter8 and HFilter8 BUG=webp:352 Change-Id: Ic43eb5edd7e2508d2975c8f4f72cdba60c4b70f7	2017-07-10 11:54:39 -07:00
Scott LaVarnway	9a1a3aa827	wasm: Generic add,sub saturate BUG=webp:352 Change-Id: I2640df736ea852ca5adcb5048550e26475c777ef	2017-07-10 11:46:53 -07:00
Scott LaVarnway	9eceff25c0	wasm: Add VFilter16i and HFilter16i BUG=webp:352 Change-Id: If2cf5737d7aeab9e2d7205adfd334e3041a83c9f	2017-07-10 06:31:00 -07:00
Scott LaVarnway	fe6184d706	wasm: Add HFilter16 BUG=webp:352 Change-Id: I8f1124b36bb5769790b0e7b193acb775e8f463b1	2017-07-07 10:52:48 -07:00
James Zern	cb6c3a2a36	cosmetics,dec_wasm: constify function params Change-Id: I10f2f612b553dc2c8282f4a7d4176f645aba97c9	2017-07-06 23:23:58 -07:00
James Zern	ec666c7526	cmake: split gif detection from IMG deps gifdec isn't part of imageio lib, it's only used by gif2webp. Change-Id: I70bff378a32f8fb2ebb8a5a7701049ffff7f7992 (cherry picked from commit `dcbc1c881a`)	2017-07-07 04:11:49 +00:00
Scott LaVarnway	bafa90ccd8	wasm: Add VFilter16 BUG=webp:352 Change-Id: I97f38aee5de063957c1512f6bd429c0e84c02087	2017-07-06 11:21:53 -07:00
Scott LaVarnway	e6e3ec335c	wasm: Add Truemotion BUG=webp:352 Change-Id: Ie65e155ac2d8253a5706ee85e830ec220a12ab4b	2017-07-05 15:56:01 -07:00
Scott LaVarnway	168a3a9e28	Merge "wasm: Add Transform" into portable-intrinsics	2017-07-05 19:50:31 +00:00
Scott LaVarnway	ad4ca27449	wasm: Add Transform BUG=webp:352 Change-Id: Ib119b4121c6fd1a5165088dd132b4ab2aca627a5	2017-07-05 12:16:05 -07:00
Scott LaVarnway	3a5528713b	wasm: Add VR4, LD4, and VL4 BUG=webp:352 Change-Id: I28ec852da8aef65b7f3c372c08be5c6bf68256be	2017-07-03 05:36:24 -07:00
Scott LaVarnway	b4cefba731	wasm: Add VE4 and RD4 BUG=webp:352 Change-Id: I24286685d7c002ec55534a98bcb88ecd82562f79	2017-06-30 11:08:43 -07:00
Scott LaVarnway	440945ca57	wasm: Add DC8uv*, VE8uv, and HE8uv functions BUG=webp:352 Change-Id: Ia0b2b0d5007c12fff201ac94673312420da42a53	2017-06-30 02:14:12 +00:00
Scott LaVarnway	a37a7b00d5	wasm: Add DC16*, VE16, and HE16 functions BUG=webp:352 Change-Id: Ia003257d00c2c2ea16a6e6344671237e78c0eac6	2017-06-29 21:48:14 +00:00
James Zern	a604ab5600	cpu.cmake: skip simd disable flag check w/wasm this workarounds an ICE with clang. WASM overrides the native simd so apart from binary size building it isn't an issue. BUG=webp:350 Change-Id: Ib0195049249e6cb86d3225ce5db7247ac22cdbd6	2017-06-28 19:34:24 -07:00
James Zern	b005d916f8	dsp/cpu: correct wasmCPUInfo placement WASM needs to precede platform specific architecture checks as the defines will still be present when building this target. Change-Id: I823f4922829561ea298c6837068b79bf9f1aee1b	2017-06-26 16:05:51 -07:00
James Zern	586eda373d	Revert "dsp/cpu: correct wasmCPUInfo placement" This reverts commit `4026e34e3f`. fails to build; not all x86 paths were protected Change-Id: I27bcc83e74440205bfd99c31c6da319c205ef145	2017-06-26 15:49:20 -07:00
James Zern	4026e34e3f	dsp/cpu: correct wasmCPUInfo placement WASM needs to precede platform specific architecture checks as the defines will still be present when building this target. Change-Id: If25467ea286e582b928e26e716e41aff72898c50	2017-06-26 12:12:46 -07:00
James Zern	4b21971337	add dec_wasm.c stub + basic cmake support for targeting native code generation using portable intrinsics / wasm (WebAssembly). integrating this into the webp_js path will be left until the implementation is more complete. Change-Id: I3e751b511f6d671da5ba8afc88ca412f31f097b0	2017-06-22 23:21:31 -07:00