Merge "Remove now unused ExtraCostCombined" into main

Remove now unused ExtraCostCombined
Change-Id: Ic9d1ccf5b10fed67f836aa19fa0f84238acbf4c1
2025-04-10 02:46:47 +02:00 · 2025-04-01 00:28:47 -07:00 · 2025-03-29 23:34:20 +01:00 · 2025-03-28 12:37:24 -07:00 · 2025-03-28 15:00:41 +01:00 · 2025-03-28 11:44:03 +01:00
206 changed files with 15748 additions and 4588 deletions
--- a/.gitignore
+++ b/.gitignore
@ -52,5 +52,6 @@ tests/fuzzer/animdecoder_fuzzer
 tests/fuzzer/animencoder_fuzzer
 tests/fuzzer/demux_api_fuzzer
 tests/fuzzer/enc_dec_fuzzer
+tests/fuzzer/huffman_fuzzer
 tests/fuzzer/mux_demux_api_fuzzer
 tests/fuzzer/simple_api_fuzzer
--- a/10
+++ b/10
@ -2,6 +2,8 @@ Contributors:
 - Aidan O'Loan (aidanol at gmail dot com)
 - Alan Browning (browning at google dot com)
 - Alexandru Ardelean (ardeleanalex at gmail dot com)
+- Anuraag Agrawal (anuraaga at gmail dot com)
+- Arthur Eubanks (aeubanks at google dot com)
 - Brian Ledger (brianpl at google dot com)
 - Charles Munger (clm at google dot com)
 - Cheng Yi (cyi at google dot com)
@ -9,16 +11,20 @@ Contributors:
 - Christopher Degawa (ccom at randomderp dot com)
 - Clement Courbet (courbet at google dot com)
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
+- Frank (1433351828 at qq dot com)
 - Frank Barchard (fbarchard at google dot com)
 - Hui Su (huisu at google dot com)
 - H. Vetinari (h dot vetinari at gmx dot com)
 - Ilya Kurdyukov (jpegqs at gmail dot com)
 - Ingvar Stepanyan (rreverser at google dot com)
+- Istvan Stefan (Istvan dot Stefan at arm dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Jehan (jehan at girinstud dot io)
 - Jeremy Maitin-Shepard (jbms at google dot com)
 - Johann Koenig (johann dot koenig at duck dot com)
+- Jonathan Grant (jgrantinfotech at gmail dot com)
+- Jonliu1993 (13720414433 at 163 dot com)
 - Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
 - Jyrki Alakuijala (jyrki at google dot com)
 - Konstantin Ivlev (tomskside at gmail dot com)
@ -28,13 +34,16 @@ Contributors:
 - Marcin Kowalczyk (qrczak at google dot com)
 - Martin Olsson (mnemo at minimum dot se)
 - Maryla Ustarroz-Calonge (maryla at google dot com)
+- Masahiro Hanada (hanada at atmark-techno dot com)
 - Mikołaj Zalewski (mikolajz at google dot com)
 - Mislav Bradac (mislavm at google dot com)
+- natewood (natewood at fb dot com)
 - Nico Weber (thakis at chromium dot org)
 - Noel Chromium (noel at chromium dot org)
 - Nozomi Isozaki (nontan at pixiv dot co dot jp)
 - Oliver Wolff (oliver dot wolff at qt dot io)
 - Owen Rodley (orodley at google dot com)
+- Ozkan Sezer (sezeroz at gmail dot com)
 - Parag Salasakar (img dot mips1 at gmail dot com)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
@ -55,6 +64,7 @@ Contributors:
 - Vincent Rabaud (vrabaud at google dot com)
 - Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
 - Wan-Teh Chang (wtc at google dot com)
+- wrv (wrv at utexas dot edu)
 - Yang Zhang (yang dot zhang at arm dot com)
 - Yannis Guyon (yguyon at google dot com)
 - Zhi An Ng (zhin at chromium dot org)
--- a/Android.mk
+++ b/Android.mk
@ -164,6 +164,7 @@ utils_dec_srcs := \
    src/utils/color_cache_utils.c \
    src/utils/filters_utils.c \
    src/utils/huffman_utils.c \
+    src/utils/palette.c \
    src/utils/quant_levels_dec_utils.c \
    src/utils/random_utils.c \
    src/utils/rescaler_utils.c \
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -9,11 +9,7 @@
 if(APPLE)
  cmake_minimum_required(VERSION 3.17)
 else()
-  cmake_minimum_required(VERSION 3.7)
-endif()
-
-if(POLICY CMP0072)
-  cmake_policy(SET CMP0072 NEW)
+  cmake_minimum_required(VERSION 3.16)
 endif()

 project(WebP C)
@ -45,12 +41,15 @@ option(WEBP_BUILD_LIBWEBPMUX "Build the libwebpmux library." ON)
 option(WEBP_BUILD_WEBPMUX "Build the webpmux command line tool." ON)
 option(WEBP_BUILD_EXTRAS "Build extras." ON)
 option(WEBP_BUILD_WEBP_JS "Emscripten build of webp.js." OFF)
+option(WEBP_BUILD_FUZZTEST "Build the fuzztest tests." OFF)
 option(WEBP_USE_THREAD "Enable threading support" ON)
 option(WEBP_NEAR_LOSSLESS "Enable near-lossless encoding" ON)
 option(WEBP_ENABLE_SWAP_16BIT_CSP "Enable byte swap for 16 bit colorspaces."
       OFF)
 set(WEBP_BITTRACE "0" CACHE STRING "Bit trace mode (0=none, 1=bit, 2=bytes)")
 set_property(CACHE WEBP_BITTRACE PROPERTY STRINGS 0 1 2)
+option(WEBP_ENABLE_WUNUSED_RESULT "Add [[nodiscard]] to some functions. \
+       CMake must be at least 3.21 to force C23" OFF)

 if(WEBP_LINK_STATIC)
  if(WIN32)
@ -133,7 +132,7 @@ if(WEBP_UNICODE)
  add_definitions(-DUNICODE -D_UNICODE)
 endif()

-if(MSVC AND BUILD_SHARED_LIBS)
+if(WIN32 AND BUILD_SHARED_LIBS)
  add_definitions(-DWEBP_DLL)
 endif()

@ -161,7 +160,20 @@ if(MSVC)
  set(CMAKE_STATIC_LIBRARY_PREFIX "${webp_libname_prefix}")
 endif()

-set(CMAKE_C_VISIBILITY_PRESET hidden)
+if(NOT WIN32)
+  set(CMAKE_C_VISIBILITY_PRESET hidden)
+endif()
+
+if(WEBP_ENABLE_WUNUSED_RESULT)
+  if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.21.0)
+    set(CMAKE_C_STANDARD 23)
+  else()
+    unset(CMAKE_C_STANDARD)
+    add_compile_options($<$<COMPILE_LANGUAGE:C>:-std=gnu2x>)
+  endif()
+  add_compile_options(-Wunused-result)
+  add_definitions(-DWEBP_ENABLE_NODISCARD=1)
+endif()

 # ##############################################################################
 # Android only.
@ -360,9 +372,11 @@ if(XCODE)
 endif()
 target_link_libraries(webpdecoder ${WEBP_DEP_LIBRARIES})
 target_include_directories(
-  webpdecoder PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
-  INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
-            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+  webpdecoder
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
+  INTERFACE
+    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR};${CMAKE_CURRENT_BINARY_DIR}>"
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 set_target_properties(
  webpdecoder
  PROPERTIES PUBLIC_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h;\
@ -463,6 +477,8 @@ endif()
 if(WEBP_BUILD_ANIM_UTILS
   OR WEBP_BUILD_CWEBP
   OR WEBP_BUILD_DWEBP
+   OR WEBP_BUILD_EXTRAS
+   OR WEBP_BUILD_FUZZTEST
   OR WEBP_BUILD_GIF2WEBP
   OR WEBP_BUILD_IMG2WEBP
   OR WEBP_BUILD_VWEBP
@ -499,6 +515,8 @@ if(WEBP_BUILD_ANIM_UTILS
    TARGET exampleutil imageioutil imagedec imageenc
    PROPERTY INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/src
             ${CMAKE_CURRENT_BINARY_DIR}/src)
+  target_include_directories(imagedec PRIVATE ${WEBP_DEP_IMG_INCLUDE_DIRS})
+  target_include_directories(imageenc PRIVATE ${WEBP_DEP_IMG_INCLUDE_DIRS})
 endif()

 if(WEBP_BUILD_DWEBP)
@ -545,7 +563,8 @@ if(WEBP_BUILD_GIF2WEBP)
  add_executable(gif2webp ${GIF2WEBP_SRCS})
  target_link_libraries(gif2webp exampleutil imageioutil webp libwebpmux
                        ${WEBP_DEP_GIF_LIBRARIES})
-  target_include_directories(gif2webp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
+  target_include_directories(gif2webp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src
+                                              ${CMAKE_CURRENT_SOURCE_DIR})
  install(TARGETS gif2webp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()

@ -638,15 +657,30 @@ if(WEBP_BUILD_EXTRAS)
                                                  ${CMAKE_CURRENT_BINARY_DIR})

  # vwebp_sdl
-  find_package(SDL)
-  if(WEBP_BUILD_VWEBP AND SDL_FOUND)
+  find_package(SDL2 QUIET)
+  if(WEBP_BUILD_VWEBP AND SDL2_FOUND)
    add_executable(vwebp_sdl ${VWEBP_SDL_SRCS})
-    target_link_libraries(vwebp_sdl ${SDL_LIBRARY} imageioutil webp)
+    target_link_libraries(vwebp_sdl ${SDL2_LIBRARIES} imageioutil webp)
    target_include_directories(
      vwebp_sdl PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}
-                        ${CMAKE_CURRENT_BINARY_DIR}/src ${SDL_INCLUDE_DIR})
+                        ${CMAKE_CURRENT_BINARY_DIR}/src ${SDL2_INCLUDE_DIRS})
    set(WEBP_HAVE_SDL 1)
    target_compile_definitions(vwebp_sdl PUBLIC WEBP_HAVE_SDL)
+
+    set(CMAKE_REQUIRED_INCLUDES "${SDL2_INCLUDE_DIRS}")
+    check_c_source_compiles(
+      "
+        #define SDL_MAIN_HANDLED
+        #include \"SDL.h\"
+        int main(void) {
+          return 0;
+        }
+      "
+      HAVE_JUST_SDL_H)
+    set(CMAKE_REQUIRED_INCLUDES)
+    if(HAVE_JUST_SDL_H)
+      target_compile_definitions(vwebp_sdl PRIVATE WEBP_HAVE_JUST_SDL_H)
+    endif()
  endif()
 endif()

@ -661,31 +695,44 @@ if(WEBP_BUILD_WEBP_JS)
  else()
    set(emscripten_stack_size "-sTOTAL_STACK=5MB")
  endif()
+  find_package(SDL2 REQUIRED)
  # wasm2js does not support SIMD.
  if(NOT WEBP_ENABLE_SIMD)
    # JavaScript version
    add_executable(webp_js ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
-    target_link_libraries(webp_js webpdecoder SDL)
+    target_link_libraries(webp_js webpdecoder SDL2)
    target_include_directories(webp_js PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
    set(WEBP_HAVE_SDL 1)
    set_target_properties(
      webp_js
-      PROPERTIES LINK_FLAGS "-sWASM=0 ${emscripten_stack_size} \
+      PROPERTIES
+        # Emscripten puts -sUSE_SDL2=1 in this variable, though it's needed at
+        # compile time to ensure the headers are downloaded.
+        COMPILE_OPTIONS "${SDL2_LIBRARIES}"
+        LINK_FLAGS
+        "-sWASM=0 ${emscripten_stack_size} \
         -sEXPORTED_FUNCTIONS=_WebPToSDL -sINVOKE_RUN=0 \
-         -sEXPORTED_RUNTIME_METHODS=cwrap")
+         -sEXPORTED_RUNTIME_METHODS=cwrap ${SDL2_LIBRARIES} \
+         -sALLOW_MEMORY_GROWTH")
    set_target_properties(webp_js PROPERTIES OUTPUT_NAME webp)
    target_compile_definitions(webp_js PUBLIC EMSCRIPTEN WEBP_HAVE_SDL)
  endif()

  # WASM version
  add_executable(webp_wasm ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
-  target_link_libraries(webp_wasm webpdecoder SDL)
+  target_link_libraries(webp_wasm webpdecoder SDL2)
  target_include_directories(webp_wasm PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
  set_target_properties(
    webp_wasm
-    PROPERTIES LINK_FLAGS "-sWASM=1 ${emscripten_stack_size} \
+    PROPERTIES
+      # Emscripten puts -sUSE_SDL2=1 in this variable, though it's needed at
+      # compile time to ensure the headers are downloaded.
+      COMPILE_OPTIONS "${SDL2_LIBRARIES}"
+      LINK_FLAGS
+      "-sWASM=1 ${emscripten_stack_size} \
       -sEXPORTED_FUNCTIONS=_WebPToSDL -sINVOKE_RUN=0 \
-       -sEXPORTED_RUNTIME_METHODS=cwrap")
+       -sEXPORTED_RUNTIME_METHODS=cwrap ${SDL2_LIBRARIES} \
+       -sALLOW_MEMORY_GROWTH")
  target_compile_definitions(webp_wasm PUBLIC EMSCRIPTEN WEBP_HAVE_SDL)

  target_compile_definitions(webpdspdecode PUBLIC EMSCRIPTEN)
@ -725,6 +772,10 @@ if(WEBP_BUILD_ANIM_UTILS)
  target_include_directories(anim_dump PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
 endif()

+if(WEBP_BUILD_FUZZTEST)
+  add_subdirectory(tests/fuzzer)
+endif()
+
 # Install the different headers and libraries.
 install(
  TARGETS ${INSTALLED_LIBRARIES}
--- a/296
+++ b/296
@ -1,17 +1,309 @@
+c3d85ce4 update NEWS
+ad14e811 tests/fuzzer/*: add missing <string_view> include
+74cd026e fuzz_utils.cc: fix build error w/WEBP_REDUCE_SIZE
+a027aa93 mux_demux_api_fuzzer.cc: fix -Wshadow warning
+25e17c68 update ChangeLog (tag: v1.5.0-rc1)
+aa2684fc update NEWS
+36923846 bump version to 1.5.0
+ceea8ff6 update AUTHORS
+e4f7a9f0 img2webp: add a warning for unused options
+1b4c967f Merge "Properly check the data size against the end of the RIFF chunk" into main
+9e5ecfaf Properly check the data size against the end of the RIFF chunk
+da0d9c7d examples: exit w/failure w/no args
+fcff86c7 {gif,img}2webp: sync -m help w/cwebp
+b76c4a84 man/img2webp.1: sync -m text w/cwebp.1 & gif2webp.1
+30633519 muxread: fix reading of buffers > riff size
+4c85d860 yuv.h: update RGB<->YUV coefficients in comment
+0ab789e0 Merge changes I6dfedfd5,I2376e2dc into main
+03236450 {ios,xcframework}build.sh: fix compilation w/Xcode 16
+61e2cfda rework AddVectorEq_SSE2
+7bda3deb rework AddVector_SSE2
+2ddaaf0a Fix variable names in SharpYuvComputeConversionMatrix
+a3ba6f19 Makefile.vc: fix gif2webp link error
+f999d94f gif2webp: add -sharp_yuv/-near_lossless
+dfdcb7f9 Merge "lossless.h: fix function declaration mismatches" into main (tag: webp-rfc9649)
+78ed6839 fix overread in Intra4Preds_NEON
+d516a68e lossless.h: fix function declaration mismatches
+87406904 Merge "Improve documentation of SharpYuvConversionMatrix." into main
+fdb229ea Merge changes I07a7e36a,Ib29980f7,I2316122d,I2356e314,I32b53dd3, ... into main
+0c3cd9cc Improve documentation of SharpYuvConversionMatrix.
+169dfbf9 disable Intra4Preds_NEON
+2dd5eb98 dsp/yuv*: use WEBP_RESTRICT qualifier
+23bbafbe dsp/upsampling*: use WEBP_RESTRICT qualifier
+35915b38 dsp/rescaler*: use WEBP_RESTRICT qualifier
+a32b436b dsp/lossless*: use WEBP_RESTRICT qualifier
+04d4b4f3 dsp/filters*: use WEBP_RESTRICT qualifier
+b1cb37e6 dsp/enc*: use WEBP_RESTRICT qualifier
+201894ef dsp/dec*: use WEBP_RESTRICT qualifier
+02eac8a7 dsp/cost*: use WEBP_RESTRICT qualifier
+84b118c9 Merge "webp-container-spec: normalize notes & unknown chunk link" into main
+052cf42f webp-container-spec: normalize notes & unknown chunk link
+220ee529 Search for best predictor transform bits
+78619478 Try to reduce the sampling for the entropy image
+14f09ab7 webp-container-spec: reorder chunk size - N text
+a78c5356 Remove a useless malloc for entropy image
+bc491763 Merge "Refactor predictor finding" into main
+34f92238 man/{cwebp,img2webp}.1: rm 'if needed' from -sharp_yuv
+367ca938 Refactor predictor finding
+a582b53b webp-lossless-bitstream-spec: clarify some text
+0fd25d84 Merge "anim_encode.c: fix function ref in comment" into main
+f8882913 anim_encode.c: fix function ref in comment
+40e4ca60 specs_generation.md: update kramdown command line
+57883c78 img2webp: add -exact/-noexact per-frame options
+1c8eba97 img2webp,cosmetics: add missing '.' spacers to help
+2e81017c Convert predictor_enc.c to fixed point
+94de6c7f Merge "Fix fuzztest link errors w/-DBUILD_SHARED_LIBS=1" into main
+51d9832a Fix fuzztest link errors w/-DBUILD_SHARED_LIBS=1
+7bcb36b8 Merge "Fix static overflow warning." into main
+8e0cc14c Fix static overflow warning.
+cea68462 README.md: add security report note
+615e5874 Merge "make VP8LPredictor[01]_C() static" into main
+233e86b9 Merge changes Ie43dc5ef,I94cd8bab into main
+1a29fd2f make VP8LPredictor[01]_C() static
+dd9d3770 Do*Filter_*: remove row & num_rows parameters
+ab451a49 Do*Filter_C: remove dead 'inverse' code paths
+f9a480f7 {TrueMotion,TM16}_NEON: remove zero extension
+04834aca Merge changes I25c30a9e,I0a192fc6,I4cf89575 into main
+39a602af webp-lossless-bitstream-spec: normalize predictor transform ref
+f28c837d Merge "webp-container-spec: align anim pseudocode w/prose" into main
+74be8e22 Fix implicit conversion issues
+0c01db7c Merge "Increase the transform bits if possible." into main
+f2d6dc1e Increase the transform bits if possible.
+caa19e5b update link to issue tracker
+c9dd9bd4 webp-container-spec: align anim pseudocode w/prose
+8a7c8dc6 WASM: Enable VP8L_USE_FAST_LOAD
+f0c53cd9 WASM: don't use USE_GENERIC_TREE
+eef903d0 WASM: Enable 64-bit BITS caching
+6296cc8d iterator_enc: make VP8IteratorReset() static
+fbd93896 histogram_enc: make VP8LGetHistogramSize static
+cc7ff545 cost_enc: make VP8CalculateLevelCosts[] static
+4e2828ba vp8l_dec: make VP8LClear() static
+d742b24a Intra16Preds_NEON: fix truemotion saturation
+c7bb4cb5 Intra4Preds_NEON: fix truemotion saturation
+952a989b Merge "Remove TODO now that log is using fixed point." into main
+dde11574 Remove TODO now that log is using fixed point.
+a1ca153d Fix hidden myerr in my_error_exit
+3bd94202 Merge changes Iff6e47ed,I24c67cd5,Id781e761 into main
+d27d246e Merge "Convert VP8LFastSLog2 to fixed point" into main
+4838611f Disable msg_code use in fuzzing mode
+314a142a Use QuantizeBlock_NEON for VP8EncQuantizeBlockWHT on Arm
+3bfb05e3 Add AArch64 Neon implementation of Intra16Preds
+baa93808 Add AArch64 Neon implementation of Intra4Preds
+41a5e582 Fix errors when compiling code as C++
+fb444b69 Convert VP8LFastSLog2 to fixed point
+c1c89f51 Fix WEBP_NODISCARD comment and C++ version
+66408c2c Switch the histogram_enc.h API to fixed point
+ac1e410d Remove leftover tiff dep
+b78d3957 Disable TIFF on fuzztest.
+cff21a7d Do not build statically on oss-fuzz.
+6853a8e5 Merge "Move more internal fuzzers to public." into main
+9bc09db4 Merge "Convert VP8LFastLog2 to fixed point" into main
+0a9f1c19 Convert VP8LFastLog2 to fixed point
+db0cb9c2 Move more internal fuzzers to public.
+ff2b5b15 Merge "advanced_api_fuzzer.cc: use crop dims in OOM check" into main
+c4af79d0 Put 0 at the end of a palette and do not store it.
+0ec80aef Delete last references to delta palettization
+96d79f84 advanced_api_fuzzer.cc: use crop dims in OOM check
+c35c7e02 Fix huffman fuzzer to not leak.
+f2fe8dec Bump fuzztest dependency.
+9ce982fd Fix fuzz tests to work on oss-fuzz
+3ba8af1a Do not escape quotes anymore in build.sh
+ea0e121b Allow centipede to be used as a fuzzing engine.
+27731afd make VP8I4ModeOffsets & VP8MakeIntra4Preds static
+ddd6245e oss-fuzz/build.sh: use heredoc for script creation
+50074930 oss-fuzz/build.sh,cosmetics: fix indent
+20e92f7d Limit the possible fuzz engines.
+4f200de5 Switch public fuzz tests to fuzztest.
+64186bb3 Add huffman_fuzzer to .gitignore
+0905f61c Move build script from oss-fuzz repo to here.
+e8678758 Fix link to Javascript documentation
+5e5b8f0c Fix SSE2 Transform_AC3 function name
+45129ee0 Revert "Check all the rows."
+ee26766a Check all the rows.
+7ec51c59 Increase the transform bits if possible.
+3cd16fd3 Revert "Increase the transform bits if possible."
+971a03d8 Increase the transform bits if possible.
+1bf198a2 Allow transform_bits to be different during encoding.
+1e462ca8 Define MAX_TRANSFORM_BITS according to the specification.
+64d1ec23 Use (MIN/NUM)_(TRANSFORM/HUFFMAN)_BITS where appropriate
+a90160e1 Refactor histograms in predictors.
+a7aa7525 Fix some function declarations
+68ff4e1e Merge "jpegdec: add a hint for EOF/READ errors" into main
+79e7968a jpegdec: add a hint for EOF/READ errors
+d33455cd man/*: s/BUGS/REPORTING BUGS/
+a67ff735 normalize example exit status
+edc28909 upsampling_{neon,sse41}: fix int sanitizer warning
+3cada4ce ImgIoUtilReadFile: check ftell() return
+dc950585 Merge tag 'v1.4.0'
+845d5476 update ChangeLog (tag: v1.4.0, origin/1.4.0)
+8a6a55bb update NEWS
+cf7c5a5d provide a way to opt-out/override WEBP_NODISCARD
+cc34288a update ChangeLog (tag: v1.4.0-rc1)
+f13c0886 NEWS: fix date
+74555950 Merge "vwebp: fix window title when options are given" into 1.4.0
+d781646c vwebp: fix window title when options are given
+c2e394de update NEWS
+f6d15cb7 bump version to 1.4.0
+57c388b8 update AUTHORS
+b3d1b2cb Merge changes I26f4aa22,I83386b6c,I320ed1a2 into main
+07216886 webp-container-spec: fix VP8 chunk ref ('VP8'->'VP8 ')
+f88666eb webp_js/*.html: fix canvas mapping
+e2c8f233 cmake,wasm: simplify SDL2 related flags
+d537cd37 cmake: fix vwebp_sdl compile w/libsdl-org release
+6c484cbf CMakeLists.txt: add missing WEBP_BUILD_EXTRAS check
+7b0bc235 man/cwebp.1: add more detail to -partition_limit
+3c0011bb WebPMuxGetChunk: add an assert
+955a3d14 Merge "muxread,MuxGet: add an assert" into main
+00abc000 muxread,MuxGet: add an assert
+40e85a0b Have the window title reflect the filename.
+1bf46358 man/cwebp.1: clarify -pass > 1 behavior w/o -size/-psnr
+eba03acb webp-container-spec: replace 'above' with 'earlier'
+a16d30cb webp-container-spec: clarify chunk order requirements
+8a7e9112 Merge "CMakeLists.txt: apply cmake-format" into main
+7fac6c1b Merge "Copy C code to not have multiplication overflow" into main
+e2922e43 Merge "Check for the presence of the ANDROID_ABI variable" into main
+501d9274 Copy C code to not have multiplication overflow
+fba7d62e CMakeLists.txt: apply cmake-format
+661c1b66 Merge "windows exports: use dllexport attribute, instead of visibility." into main
+8487860a windows exports: use dllexport attribute, instead of visibility.
+8ea678b9 webp/mux.h: data lifetime note w/copy_data=0
+79e05c7f Check for the presence of the ANDROID_ABI variable
+45f995a3 Expose functions for managing non-image chunks on WebPAnimEncoder
+1fb9f3dc gifdec: fix ErrorGIFNotAvailable() declaration
+4723db65 cosmetics: s/SANITY_CHECK/DCHECK/
+f4b9bc9e clear -Wextra-semi-stmt warnings
+713982b8 Limit animdecoder_fuzzer to 320MB
+cbe825e4 cmake: fix sharpyuv simd files' build
+f99305e9 Makefile.vc: add ARM64 support
+5efd6300 mv SharpYuvEstimate420Risk to extras/
+e78e924f Makefile.vc: add sharpyuv_risk_table.obj
+d7a0506d Add YUV420 riskiness metric.
+89c5b917 Merge "BuildHuffmanTable check sorted[] array bounds before writing" into main
+34c80749 Remove alpha encoding pessimization.
+13d9c30b Add a WEBP_NODISCARD
+24d7f9cb Switch code to SDL2.
+0b56dedc BuildHuffmanTable check sorted[] array bounds before writing
+a429c0de sharpyuv: convert some for() to do/while
+f0cd7861 DoSharpArgbToYuv: remove constant from loop
+339231cc SharpYuvConvertWithOptions,cosmetics: fix formatting
+307071f1 Remove medium/large code model-specific inline asm
+deadc339 Fix transfer functions where toGamma and toLinear are swapped.
+e7b78d43 Merge "Fix bug in FromLinearLog100." into main
+15a1309e Merge "webp-lossless-bitstream-spec: delete extra blank line" into main
+54ca9752 Fix bug in FromLinearLog100.
+d2cb2d8c Dereference after NULL check.
+e9d50107 webp-lossless-bitstream-spec: delete extra blank line
+78657971 Merge changes Ief442c90,Ie6e9c9a5 into main
+e30a5884 webp-lossless-bitstream-spec: update variable names
+09ca1368 Merge "webp-container-spec: change assert to MUST be TRUE" into main
+38cb4fc0 iosbuild,xcframeworkbuild: add SharpYuv framework
+40afa926 webp-lossless-bitstream-spec: simplify abstract
+9db21143 webp-container-spec: change assert to MUST be TRUE
+cdbf88ae Fix typo in API docs for incremental decoding
+05c46984 Reformat vcpkg build instructions.
+8534f539 Merge "Never send VP8_STATUS_SUSPENDED back in non-incremental." into main
+35e197bd Never send VP8_STATUS_SUSPENDED back in non-incremental.
+61441425 Add vcpkg installation instructions
+dce8397f Fix next is invalid pointer when WebPSafeMalloc fails
+57c58105 Cmake: wrong public macro WEBP_INCLUDE_DIRS
+c1ffd9ac Merge "vp8l_enc: fix non-C90 code" into main
+a3965948 Merge changes If628bb93,Ic79f6309,I45f0db23 into main
+f80e9b7e vp8l_enc: fix non-C90 code
+accd141d Update lossless spec for two simple codes.
+ac17ffff Fix non-C90 code.
+433c7dca Fix static analyzer warnings.
+5fac76cf Merge tag 'v1.3.2'
+ca332209 update ChangeLog (tag: v1.3.2)
+1ace578c update NEWS
+63234c42 bump version to 1.3.2
+a35ea50d Add a fuzzer for ReadHuffmanCodes
+95ea5226 Fix invalid incremental decoding check.
+2af26267 Fix OOB write in BuildHuffmanTable.
+902bc919 Fix OOB write in BuildHuffmanTable.
+7ba44f80 Homogenize "__asm__ volatile" vs "asm volatile"
+68e27135 webp-container-spec: reorder example chunk layout
+943b932a Merge changes I6a4d0a04,Ibc37b91e into main
+1cc94f95 decode.h: wrap idec example in /* */
+63acdd1e decode.h: fix decode example
+aac5c5d0 ReadHuffmanCode: rm redundant num code lengths check
+a2de25f6 webp-lossless-bitstream-spec: normalize list item case
+68820f0e webp-lossless-bitstream-spec: normalize pixel ref
+cdb31aa8 webp-lossless-bitstream-spec: add missing periods
+0535a8cf webp-lossless-bitstream-spec: fix grammar
+b6c4ce26 normalize numbered list item format
+dd7364c3 Merge "palette.c: fix msvc warnings" into main
+c63c5df6 palette.c: fix msvc warnings
+0a2cad51 webp-container-spec: move terms from intro section
+dd88d2ff webp-lossless-bitstream-spec: color_cache -> color cache
+6e750547 Merge changes I644d7d39,Icf05491e,Ic02e6652,I63b11258 into main
+67a7cc2b webp-lossless-bitstream-spec: fix code blocks
+1432ebba Refactor palette sorting computation.
+cd436142 webp-lossless-bitstream-spec: block -> chunk
+3cb66f64 webp-lossless-bitstream-spec: add some missing commas
+56471a53 webp-lossless-bitstream-spec: normalize item text in 5.1
+af7fbfd2 vp8l_dec,ReadTransform: improve error status reporting
+7d8e0896 vp8l_dec: add VP8LSetError()
+a71ce1cf animencoder_fuzzer: fix error check w/Nallocfuzz
+e94b36d6 webp-lossless-bitstream-spec: relocate details from 5.1
+84628e56 webp-lossless-bitstream-spec: clarify image width changes
+ee722997 alpha_dec: add missing VP8SetError()
+0081693d enc_dec_fuzzer: use WebPDecode()
+0fcb311c enc_dec_fuzzer: fix WebPEncode/pic.error_code check
+982c177c webp-lossless-bitstream-spec: fix struct member refs
+56cf5625 webp-lossless-bitstream-spec: use RFC 7405 for ABNF
+6c6b3fd3 webp-lossless-bitstream-spec,cosmetics: delete blank lines
+29b9eb15 Merge changes Id56ca4fd,I662bd1d7 into main
+47c0af8d ReadHuffmanCodes: rm max_alphabet_size calc
+b92deba3 animencoder_fuzzer: no WebPAnimEncoderAssemble check w/nallocfuzz
+6be9bf8b animencoder_fuzzer: fix leak on alloc failure
+5c965e55 vp8l_dec,cosmetics: add some /*param=*/ comments
+e4fc2f78 webp-lossless-bitstream-spec: add validity note for max_symbol
+71916726 webp-lossless-bitstream-spec: fix max_symbol definition
+eac3bd5c Have the palette code be in its own file.
+e2c85878 Add an initializer for the SharpYuvOptions struct.
+4222b006 Merge tag 'v1.3.1'
+25d94f47 Implement more transfer functions in libsharpyuv
+2153a679 Merge changes Id0300937,I5dba5ccf,I57bb68e0,I2dba7b4e,I172aca36, ... into main
+4298e976 webp-lossless-bitstream-spec: add PredictorTransformOutput
+cd7e02be webp-lossless-bitstream-spec: fix RIFF-header ABNF
+6c3845f9 webp-lossless-bitstream-spec: split LZ77 Backward Ref section
+7f1b6799 webp-lossless-bitstream-spec: split Meta Prefix Codes section
+7b634d8f webp-lossless-bitstream-spec: note transform order
+6d6d4915 webp-lossless-bitstream-spec: update transformations text
+fd7bb21c update ChangeLog (tag: v1.3.1-rc2, tag: v1.3.1)
 e1adea50 update NEWS
+6b1c722a lossless_common.h,cosmetics: fix a typo
+08d60d60 webp-lossless-bitstream-spec: split code length section
+7a12afcc webp-lossless-bitstream-spec: rm unused anchor
 43393320 enc/*: normalize WebPEncodingSetError() calls
 287fdefe enc/*: add missing WebPEncodingSetError() calls
 c3bd7cff EncodeAlphaInternal: add missing error check
+14a9dbfb webp-lossless-bitstream-spec: refine single node text
+64819c7c Implement ExtractGreen_SSE2
 d49cfbb3 vp8l_enc,WriteImage: add missing error check
 2e5a9ec3 muxread,MuxImageParse: add missing error checks
 ebb6f949 cmake,emscripten: explicitly set stack size
 59a2b1f9 WebPDecodeYUV: check u/v/stride/uv_stride ptrs
 8e965ccb Call png_get_channels() to see if image has alpha
+fe80fbbd webp-container-spec: add some missing commas
+e8ed3176 Merge "treat FILTER_NONE as a regular Unfilter[] call" into main
+03a7a048 webp-lossless-bitstream-spec: rm redundant statement
+c437c7aa webp-lossless-bitstream-spec: mv up prefix code group def
+e4f17a31 webp-lossless-bitstream-spec: fix section reference
+e2ecd5e9 webp-lossless-bitstream-spec: clarify ABNF syntax
+8b55425a webp-lossless-bitstream-spec: refine pixel copy text
+29c9f2d4 webp-lossless-bitstream-spec: minor wording updates
+6b02f660 treat FILTER_NONE as a regular Unfilter[] call
+7f75c91c webp-container-spec: fix location of informative msg
+f6499943 webp-container-spec: consistently quote FourCCs
+49918af3 webp-container-spec: minor wording updates
 7f0a3419 update ChangeLog (tag: v1.3.1-rc1)
 bab7efbe update NEWS
 7138bf8f bump version to 1.3.1
 435b4ded update AUTHORS
 47351229 update .mailmap
+46bc4fc9 Merge "Switch ExtraCost to ints and implement it in SSE." into main
+828b4ce0 Switch ExtraCost to ints and implement it in SSE.
 ff6c7f4e CONTRIBUTING.md: add C style / cmake-format notes
 dd530437 add .cmake-format.py
 adbe2cb1 cmake,cosmetics: apply cmake-format
@ -1209,7 +1501,7 @@ b016cb91 NEON: faster fancy upsampling
 f04eb376 Merge tag 'v0.5.2'
 341d711c NEON: 5% faster conversion to RGB565 and RGBA4444
 abb54827 remove Clang warnings with unused arch arguments.
-ece9684f update ChangeLog (tag: v0.5.2-rc2, tag: v0.5.2, origin/0.5.2)
+ece9684f update ChangeLog (tag: v0.5.2-rc2, tag: v0.5.2)
 aa7744ca anim_util: quiet implicit conv warnings in 32-bit
 d9120271 jpegdec: correct ContextFill signature
 24eb3940 Remove some errors when compiling the code as C++.
@ -1496,7 +1788,7 @@ bbb6ecd9 Merge "Add MSA optimized distortion functions"
 c0991a14 io,EmitRescaledAlphaYUV: factor out a common expr
 48bf5ed1 build.gradle: remove tab
 bfef6c9f Merge tag 'v0.5.1'
-3d97bb75 update ChangeLog (tag: v0.5.1, origin/0.5.1)
+3d97bb75 update ChangeLog (tag: v0.5.1)
 deb54d91 Clarify the expected 'config' lifespan in WebPIDecode()
 435308e0 Add MSA optimized encoder transform functions
 dce64bfa Add MSA optimized alpha filter functions
--- a/Makefile.vc
+++ b/Makefile.vc
@ -12,6 +12,8 @@ LIBSHARPYUV_BASENAME = libsharpyuv
 ARCH = x86
 !ELSE IF ! [ cl 2>&1 | find "x64" > NUL ]
 ARCH = x64
+!ELSE IF ! [ cl 2>&1 | find "ARM64" > NUL ]
+ARCH = ARM64
 !ELSE IF ! [ cl 2>&1 | find "ARM" > NUL ]
 ARCH = ARM
 !ELSE
@ -30,7 +32,7 @@ PLATFORM_LDFLAGS = /SAFESEH
 NOLOGO     = /nologo
 CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
 CCDEBUG    = cl.exe $(NOLOGO) /Od /Zi /D_DEBUG /RTC1
-CFLAGS     = /I. /Isrc $(NOLOGO) /W3 /EHsc /c
+CFLAGS     = /I. /Isrc $(NOLOGO) /MP /W3 /EHsc /c
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST:EMBED /NXCOMPAT /DYNAMICBASE
 LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
@ -229,6 +231,7 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\lossless_neon.obj \
    $(DIROBJ)\dsp\lossless_sse2.obj \
    $(DIROBJ)\dsp\lossless_sse41.obj \
+    $(DIROBJ)\dsp\lossless_avx2.obj \
    $(DIROBJ)\dsp\rescaler.obj \
    $(DIROBJ)\dsp\rescaler_mips32.obj \
    $(DIROBJ)\dsp\rescaler_mips_dsp_r2.obj \
@ -268,6 +271,7 @@ DSP_ENC_OBJS = \
    $(DIROBJ)\dsp\lossless_enc_neon.obj \
    $(DIROBJ)\dsp\lossless_enc_sse2.obj \
    $(DIROBJ)\dsp\lossless_enc_sse41.obj \
+    $(DIROBJ)\dsp\lossless_enc_avx2.obj \
    $(DIROBJ)\dsp\ssim.obj \
    $(DIROBJ)\dsp\ssim_sse2.obj \

@ -321,6 +325,7 @@ ENC_OBJS = \
 EXTRAS_OBJS = \
    $(DIROBJ)\extras\extras.obj \
    $(DIROBJ)\extras\quality_estimate.obj \
+    $(DIROBJ)\extras\sharpyuv_risk_table.obj \

 IMAGEIO_UTIL_OBJS = \
    $(DIROBJ)\imageio\imageio_util.obj \
@ -336,6 +341,7 @@ UTILS_DEC_OBJS = \
    $(DIROBJ)\utils\color_cache_utils.obj \
    $(DIROBJ)\utils\filters_utils.obj \
    $(DIROBJ)\utils\huffman_utils.obj \
+    $(DIROBJ)\utils\palette.obj \
    $(DIROBJ)\utils\quant_levels_dec_utils.obj \
    $(DIROBJ)\utils\rescaler_utils.obj \
    $(DIROBJ)\utils\random_utils.obj \
@ -389,7 +395,7 @@ $(DIRBIN)\dwebp.exe: $(IMAGEIO_UTIL_OBJS)
 $(DIRBIN)\dwebp.exe: $(LIBWEBPDEMUX)
 $(DIRBIN)\gif2webp.exe: $(DIROBJ)\examples\gif2webp.obj $(EX_GIF_DEC_OBJS)
 $(DIRBIN)\gif2webp.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBPMUX)
-$(DIRBIN)\gif2webp.exe: $(LIBWEBP)
+$(DIRBIN)\gif2webp.exe: $(LIBWEBP) $(LIBSHARPYUV)
 $(DIRBIN)\vwebp.exe: $(DIROBJ)\examples\vwebp.obj $(EX_UTIL_OBJS)
 $(DIRBIN)\vwebp.exe: $(IMAGEIO_UTIL_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\vwebp_sdl.exe: $(DIROBJ)\extras\vwebp_sdl.obj
--- a/41
+++ b/41
@ -1,3 +1,44 @@
+- 12/19/2024: version 1.5.0
+  This is a binary compatible release.
+  API changes:
+    - `cross_color_transform_bits` added to WebPAuxStats
+  * minor lossless encoder speed and compression improvements
+  * lossless encoding does not use floats anymore
+  * additional Arm optimizations for lossy & lossless + general code generation
+    improvements
+  * improvements to WASM performance (#643)
+  * improvements and corrections in webp-container-spec.txt and
+    webp-lossless-bitstream-spec.txt (#646, #355607636)
+  * further security related hardening and increased fuzzing coverage w/fuzztest
+    (oss-fuzz: #382816119, #70112, #70102, #69873, #69825, #69508, #69208)
+  * miscellaneous warning, bug & build fixes (#499, #562, #381372617,
+    #381109771, #42340561, #375011696, #372109644, chromium: #334120888)
+  Tool updates:
+    * gif2webp: add -sharp_yuv & -near_lossless
+    * img2webp: add -exact & -noexact
+    * exit codes normalized; running an example program with no
+      arguments will output its help and exit with an error (#42340557,
+      #381372617)
+
+- 4/12/2024: version 1.4.0
+  This is a binary compatible release.
+  * API changes:
+    - libwebpmux: WebPAnimEncoderSetChunk, WebPAnimEncoderGetChunk,
+                  WebPAnimEncoderDeleteChunk
+    - libsharpyuv: SharpYuvOptionsInit, SharpYuvConvertWithOptions
+    - extras: SharpYuvEstimate420Risk
+  * further security related hardening in libwebp & examples
+  * some minor optimizations in the lossless encoder
+  * added WEBP_NODISCARD to report unused result warnings; enable with
+    -DWEBP_ENABLE_NODISCARD=1
+  * improvements and corrections in webp-container-spec.txt and
+    webp-lossless-bitstream-spec.txt (#611)
+  * miscellaneous warning, bug & build fixes (#615, #619, #632, #635)
+
+- 9/13/2023: version 1.3.2
+  This is a binary compatible release.
+  * security fix for lossless decoder (chromium: #1479274, CVE-2023-4863)
+
 - 6/23/2023: version 1.3.1
  This is a binary compatible release.
  * security fixes for lossless encoder (#603, chromium: #1420107, #1455619,
--- a/README.md
+++ b/README.md
@ -7,7 +7,7 @@
      \__\__/\____/\_____/__/ ____  ___
            / _/ /    \    \ /  _ \/ _/
           /  \_/   / /   \ \   __/  \__
-           \____/____/\_____/_____/____/v1.3.1
+           \____/____/\_____/_____/____/v1.5.0
 ```

 WebP codec is a library to encode and decode images in WebP format. This package
@ -42,7 +42,8 @@ See the [APIs documentation](doc/api.md), and API usage examples in the

 ## Bugs

-Please report all bugs to the issue tracker: https://bugs.chromium.org/p/webp
+Please report all bugs to the [issue tracker](https://issues.webmproject.org).
+For security reports, select 'Security report' from the Template dropdown.

 Patches welcome! See [how to contribute](CONTRIBUTING.md).

--- a/build.gradle
+++ b/build.gradle
@ -173,6 +173,7 @@ model {
            include "color_cache_utils.c"
            include "filters_utils.c"
            include "huffman_utils.c"
+            include "palette.c"
            include "quant_levels_dec_utils.c"
            include "random_utils.c"
            include "rescaler_utils.c"
--- a/cmake/WebPConfig.cmake.in
+++ b/cmake/WebPConfig.cmake.in
@ -11,7 +11,8 @@ endif()
 include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")

 set_and_check(WebP_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@")
-set(WEBP_INCLUDE_DIRS ${WebP_INCLUDE_DIRS})
+set(WebP_INCLUDE_DIRS ${WebP_INCLUDE_DIR})
+set(WEBP_INCLUDE_DIRS ${WebP_INCLUDE_DIR})
 set(WebP_LIBRARIES "@INSTALLED_LIBRARIES@")
 set(WEBP_LIBRARIES "${WebP_LIBRARIES}")

--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@ -94,6 +94,9 @@
 /* Set to 1 if SSE4.1 is supported */
 #cmakedefine WEBP_HAVE_SSE41 1

+/* Set to 1 if AVX2 is supported */
+#cmakedefine WEBP_HAVE_AVX2 1
+
 /* Set to 1 if TIFF library is installed */
 #cmakedefine WEBP_HAVE_TIFF 1

--- a/cmake/cpu.cmake
+++ b/cmake/cpu.cmake
@ -38,9 +38,9 @@ function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
 endfunction()

 # Those are included in the names of WEBP_USE_* in the C code.
-set(WEBP_SIMD_FLAGS "SSE41;SSE2;MIPS32;MIPS_DSP_R2;NEON;MSA")
+set(WEBP_SIMD_FLAGS "AVX2;SSE41;SSE2;MIPS32;MIPS_DSP_R2;NEON;MSA")
 set(WEBP_SIMD_FILE_EXTENSIONS
-    "_sse41.c;_sse2.c;_mips32.c;_mips_dsp_r2.c;_neon.c;_msa.c")
+    "_avx2.c;_sse41.c;_sse2.c;_mips32.c;_mips_dsp_r2.c;_neon.c;_msa.c")
 if(MSVC AND CMAKE_C_COMPILER_ID STREQUAL "MSVC")
  # With at least Visual Studio 12 (2013)+ /arch is not necessary to build SSE2
  # or SSE4 code unless a lesser /arch is forced. MSVC does not have a SSE4
@ -50,18 +50,18 @@ if(MSVC AND CMAKE_C_COMPILER_ID STREQUAL "MSVC")
  if(MSVC_VERSION GREATER_EQUAL 1800 AND NOT CMAKE_C_FLAGS MATCHES "/arch:")
    set(SIMD_ENABLE_FLAGS)
  else()
-    set(SIMD_ENABLE_FLAGS "/arch:AVX;/arch:SSE2;;;;")
+    set(SIMD_ENABLE_FLAGS "/arch:AVX2;/arch:AVX;/arch:SSE2;;;;")
  endif()
  set(SIMD_DISABLE_FLAGS)
 else()
-  set(SIMD_ENABLE_FLAGS "-msse4.1;-msse2;-mips32;-mdspr2;-mfpu=neon;-mmsa")
-  set(SIMD_DISABLE_FLAGS "-mno-sse4.1;-mno-sse2;;-mno-dspr2;;-mno-msa")
+  set(SIMD_ENABLE_FLAGS "-mavx2;-msse4.1;-msse2;-mips32;-mdspr2;-mfpu=neon;-mmsa")
+  set(SIMD_DISABLE_FLAGS "-mno-avx2;-mno-sse4.1;-mno-sse2;;-mno-dspr2;;-mno-msa")
 endif()

 set(WEBP_SIMD_FILES_TO_INCLUDE)
 set(WEBP_SIMD_FLAGS_TO_INCLUDE)

-if(${ANDROID})
+if(ANDROID AND ANDROID_ABI)
  if(${ANDROID_ABI} STREQUAL "armeabi-v7a")
    # This is because Android studio uses the configuration "-march=armv7-a
    # -mfloat-abi=softfp -mfpu=vfpv3-d16" that does not trigger neon
@ -106,8 +106,9 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
  endif()
  # Check which files we should include or not.
  list(GET WEBP_SIMD_FILE_EXTENSIONS ${I_SIMD} WEBP_SIMD_FILE_EXTENSION)
-  file(GLOB SIMD_FILES "${CMAKE_CURRENT_LIST_DIR}/../"
-       "src/dsp/*${WEBP_SIMD_FILE_EXTENSION}")
+  file(GLOB SIMD_FILES
+       "${CMAKE_CURRENT_LIST_DIR}/../sharpyuv/*${WEBP_SIMD_FILE_EXTENSION}"
+       "${CMAKE_CURRENT_LIST_DIR}/../src/dsp/*${WEBP_SIMD_FILE_EXTENSION}")
  if(WEBP_HAVE_${WEBP_SIMD_FLAG})
    # Memorize the file and flags.
    foreach(FILE ${SIMD_FILES})
--- a/configure.ac
+++ b/configure.ac
@ -1,5 +1,5 @@
-AC_INIT([libwebp], [1.3.1],
-        [https://bugs.chromium.org/p/webp],,
+AC_INIT([libwebp], [1.5.0],
+        [https://issues.webmproject.org],,
        [https://developers.google.com/speed/webp])
 AC_CANONICAL_HOST
 AC_PREREQ([2.60])
@ -106,6 +106,7 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wconstant-conversion])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra-semi-stmt])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wfloat-conversion])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-nonliteral])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-security])
@ -115,6 +116,7 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wold-style-definition])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wparentheses-equality])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshorten-64-to-32])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wstrict-prototypes])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wundef])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunreachable-code-aggressive])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunreachable-code])
@ -159,6 +161,25 @@ AS_IF([test "$GCC" = "yes" ], [
 AC_SUBST([AM_CFLAGS])

 dnl === Check for machine specific flags
+AC_ARG_ENABLE([avx2],
+              AS_HELP_STRING([--disable-avx2],
+                             [Disable detection of AVX2 support
+                              @<:@default=auto@:>@]))
+
+AS_IF([test "x$enable_avx2" != "xno" -a "x$enable_sse4_1" != "xno"
+      -a "x$enable_sse2" != "xno"], [
+  AVX2_FLAGS="$INTRINSICS_CFLAGS $AVX2_FLAGS"
+  TEST_AND_ADD_CFLAGS([AVX2_FLAGS], [-mavx2])
+  AS_IF([test -n "$AVX2_FLAGS"], [
+    SAVED_CFLAGS=$CFLAGS
+    CFLAGS="$CFLAGS $AVX2_FLAGS"
+    AC_CHECK_HEADER([immintrin.h],
+                    [AC_DEFINE(WEBP_HAVE_AVX2, [1],
+                     [Set to 1 if AVX2 is supported])],
+                    [AVX2_FLAGS=""])
+    CFLAGS=$SAVED_CFLAGS])
+  AC_SUBST([AVX2_FLAGS])])
+
 AC_ARG_ENABLE([sse4.1],
              AS_HELP_STRING([--disable-sse4.1],
                             [Disable detection of SSE4.1 support
@ -464,7 +485,7 @@ AC_ARG_ENABLE([sdl],
                              @<:@default=auto@:>@]))
 AS_IF([test "x$enable_sdl" != "xno"], [
  CLEAR_LIBVARS([SDL])
-  AC_PATH_PROGS([LIBSDL_CONFIG], [sdl-config])
+  AC_PATH_PROGS([LIBSDL_CONFIG], [sdl2-config])
  if test -n "$LIBSDL_CONFIG"; then
    SDL_INCLUDES=`$LIBSDL_CONFIG --cflags`
    SDL_LIBS="`$LIBSDL_CONFIG --libs`"
@ -474,13 +495,12 @@ AS_IF([test "x$enable_sdl" != "xno"], [

  sdl_header="no"
  LIBCHECK_PROLOGUE([SDL])
-  AC_CHECK_HEADER([SDL/SDL.h], [sdl_header="SDL/SDL.h"],
-                  [AC_CHECK_HEADER([SDL.h], [sdl_header="SDL.h"],
-                  [AC_MSG_WARN(SDL library not available - no sdl.h)])])
+  AC_CHECK_HEADER([SDL2/SDL.h], [sdl_header="SDL2/SDL.h"],
+                  [AC_MSG_WARN(SDL2 library not available - no SDL.h)])
  if test x"$sdl_header" != "xno"; then
    AC_LANG_PUSH(C)
    SDL_SAVED_LIBS="$LIBS"
-    for lib in "" "-lSDL" "-lSDLmain -lSDL"; do
+    for lib in "" "-lSDL2" "-lSDL2main -lSDL2"; do
      LIBS="$SDL_SAVED_LIBS $lib"
      # Perform a full link to ensure SDL_main is resolved if needed.
      AC_LINK_IFELSE(
@ -762,7 +782,8 @@ AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
                 src/libwebp.pc src/libwebpdecoder.pc \
                 src/demux/libwebpdemux.pc src/mux/libwebpmux.pc])

-
+dnl fix exports from MinGW builds
+AC_CONFIG_COMMANDS_POST([$SED -i 's/-DDLL_EXPORT/-DWEBP_DLL/' config.status])
 AC_OUTPUT

 AC_MSG_NOTICE([
--- a/doc/api.md
+++ b/doc/api.md
@ -157,7 +157,7 @@ decoding is not finished yet or VP8_STATUS_OK when decoding is done. Any other
 status is an error condition.

 The 'idec' object must always be released (even upon an error condition) by
-calling: WebPDelete(idec).
+calling: WebPIDelete(idec).

 To retrieve partially decoded picture samples, one must use the corresponding
 method: WebPIDecGetRGB or WebPIDecGetYUVA. It will return the last displayable
--- a/doc/building.md
+++ b/doc/building.md
@ -96,6 +96,24 @@ make
 make install
 ```

+## Building libwebp - Using vcpkg
+
+You can download and install libwebp using the
+[vcpkg](https://github.com/Microsoft/vcpkg) dependency manager:
+
+```shell
+git clone https://github.com/Microsoft/vcpkg.git
+cd vcpkg
+./bootstrap-vcpkg.sh
+./vcpkg integrate install
+./vcpkg install libwebp
+```
+
+The libwebp port in vcpkg is kept up to date by Microsoft team members and
+community contributors. If the version is out of date, please
+[create an issue or pull request](https://github.com/Microsoft/vcpkg) on the
+vcpkg repository.
+
 ## CMake

 With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp, webpinfo
@ -210,4 +228,4 @@ generated code, but is untested.
 ## Javascript decoder

 Libwebp can be compiled into a JavaScript decoder using Emscripten and CMake.
-See the [corresponding documentation](../README.md)
+See the [corresponding documentation](../webp_js/README.md)
--- a/doc/specs_generation.md
+++ b/doc/specs_generation.md
@ -17,10 +17,11 @@ rubygems will install automatically. The following will apply inline CSS
 styling; an external stylesheet is not needed.

 ```shell
-$ kramdown doc/webp-lossless-bitstream-spec.txt --template \
-  doc/template.html --coderay-css style --coderay-line-numbers ' ' \
-  --coderay-default-lang c > \
-  doc/output/webp-lossless-bitstream-spec.html
+$ kramdown doc/webp-lossless-bitstream-spec.txt \
+  --template doc/template.html \
+  -x syntax-coderay --syntax-highlighter coderay \
+  --syntax-highlighter-opts "{default_lang: c, line_numbers: , css: style}" \
+  > doc/output/webp-lossless-bitstream-spec.html
 ```

 Optimally, use kramdown 0.13.7 or newer if syntax highlighting is desired.
--- a/doc/tools.md
+++ b/doc/tools.md
@ -321,10 +321,13 @@ Per-frame options (only used for subsequent images input):

 ```
 -d <int> ............. frame duration in ms (default: 100)
-lossless  ........... use lossless mode (default)
-lossy ... ........... use lossy mode
+-lossless ............ use lossless mode (default)
+-lossy ............... use lossy mode
 -q <float> ........... quality
-m <int> ............. method to use
+-m <int> ............. compression method (0=fast, 6=slowest), default=4
+-exact, -noexact ..... preserve or alter RGB values in transparent area
+                       (default: -noexact, may cause artifacts
+                                 with lossy animations)
 ```

 example: `img2webp -loop 2 in0.png -lossy in1.jpg -d 80 in2.tiff -o out.webp`
@ -351,8 +354,12 @@ Options:
 -lossy ................. encode image using lossy compression
 -mixed ................. for each frame in the image, pick lossy
                         or lossless compression heuristically
+-near_lossless <int> ... use near-lossless image preprocessing
+                         (0..100=off), default=100
+-sharp_yuv ............. use sharper (and slower) RGB->YUV conversion
+                         (lossy only)
 -q <float> ............. quality factor (0:small..100:big)
-m <int> ............... compression method (0=fast, 6=slowest)
+-m <int> ............... compression method (0=fast, 6=slowest), default=4
 -min_size .............. minimize output size (default:off)
                         lossless compression by default; can be
                         combined with -q, -m, -lossy or -mixed
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -21,9 +21,9 @@ Introduction
 ------------

 WebP is an image format that uses either (i) the VP8 key frame encoding to
-compress image data in a lossy way, or (ii) the WebP lossless encoding. These
-encoding schemes should make it more efficient than older formats such as JPEG,
-GIF and PNG. It is optimized for fast image transfer over the network (for
+compress image data in a lossy way or (ii) the WebP lossless encoding. These
+encoding schemes should make it more efficient than older formats, such as JPEG,
+GIF, and PNG. It is optimized for fast image transfer over the network (for
 example, for websites). The WebP format has feature parity (color profile,
 metadata, animation, etc.) with other formats as well. This document describes
 the structure of a WebP file.
@ -31,36 +31,37 @@ the structure of a WebP file.
 The WebP container (that is, the RIFF container for WebP) allows feature support
 over and above the basic use case of WebP (that is, a file containing a single
 image encoded as a VP8 key frame). The WebP container provides additional
-support for:
+support for the following:

-  * **Lossless compression.** An image can be losslessly compressed, using the
+  * Lossless Compression: An image can be losslessly compressed, using the
    WebP Lossless Format.

-  * **Metadata.** An image may have metadata stored in Exif or XMP formats.
+  * Metadata: An image may have metadata stored in Exchangeable Image File
+    Format (Exif) or Extensible Metadata Platform (XMP) format.

-  * **Transparency.** An image may have transparency, that is, an alpha channel.
+  * Transparency: An image may have transparency, that is, an alpha channel.

-  * **Color Profile.** An image may have an embedded ICC profile as described
+  * Color Profile: An image may have an embedded ICC profile as described
    by the [International Color Consortium][iccspec].

-  * **Animation.** An image may have multiple frames with pauses between them,
+  * Animation: An image may have multiple frames with pauses between them,
    making it an animation.

+Terminology & Basics
+--------------------
+
 The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
 "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this
 document are to be interpreted as described in BCP 14 [RFC 2119][] [RFC 8174][]
 when, and only when, they appear in all capitals, as shown here.

-Bit numbering in chunk diagrams starts at `0` for the most significant bit
-('MSB 0') as described in [RFC 1166][].
-
-Terminology & Basics
--------------------
-
 A WebP file contains either a still image (that is, an encoded matrix of pixels)
 or an [animation](#animation). Optionally, it can also contain transparency
-information, color profile and metadata. We refer to the matrix of pixels as the
-_canvas_ of the image.
+information, a color profile, and metadata. We refer to the matrix of pixels as
+the _canvas_ of the image.
+
+Bit numbering in chunk diagrams starts at `0` for the most significant bit
+('MSB 0'), as described in [RFC 1166][].

 Below are additional terms used throughout this document:

@ -83,7 +84,7 @@ _uint32_

 _FourCC_

-: A _FourCC_ (four-character code) is a _uint32_ created by concatenating four
+: A four-character code (FourCC) is a _uint32_ created by concatenating four
  ASCII characters in little-endian order. This means 'aaaa' (0x61616161) and
 'AAAA' (0x41414141) are treated as different _FourCCs_.

@ -94,9 +95,8 @@ _1-based_

 _ChunkHeader('ABCD')_

-: This is used to describe the _FourCC_ and _Chunk Size_ header of individual
-  chunks, where 'ABCD' is the FourCC for the chunk. This element's size is 8
-  bytes.
+: Used to describe the _FourCC_ and _Chunk Size_ header of individual chunks,
+  where 'ABCD' is the FourCC for the chunk. This element's size is 8 bytes.


 RIFF File Format
@ -124,14 +124,14 @@ Chunk FourCC: 32 bits
 Chunk Size: 32 bits (_uint32_)

 : The size of the chunk in bytes, not including this field, the chunk
-  identifier or padding.
+  identifier, or padding.

 Chunk Payload: _Chunk Size_ bytes

-: The data payload. If _Chunk Size_ is odd, a single padding byte -- that MUST
+: The data payload. If _Chunk Size_ is odd, a single padding byte -- which MUST
  be `0` to conform with RIFF -- is added.

-**Note:** RIFF has a convention that all-uppercase chunk FourCCs are standard
+**Note**: RIFF has a convention that all-uppercase chunk FourCCs are standard
 chunks that apply to any RIFF file format, while FourCCs specific to a file
 format are all lowercase. WebP does not follow this convention.

@ -151,17 +151,17 @@ WebP File Header

 'RIFF': 32 bits

-: The ASCII characters 'R' 'I' 'F' 'F'.
+: The ASCII characters 'R', 'I', 'F', 'F'.

 File Size: 32 bits (_uint32_)

-: The size of the file in bytes starting at offset 8. The maximum value of
+: The size of the file in bytes, starting at offset 8. The maximum value of
  this field is 2^32 minus 10 bytes and thus the size of the whole file is at
-  most 4GiB minus 2 bytes.
+  most 4 GiB minus 2 bytes.

 'WEBP': 32 bits

-: The ASCII characters 'W' 'E' 'B' 'P'.
+: The ASCII characters 'W', 'E', 'B', 'P'.

 A WebP file MUST begin with a RIFF header with the FourCC 'WEBP'. The file size
 in the header is the total size of the chunks that follow plus `4` bytes for
@ -188,10 +188,10 @@ Simple WebP (lossy) file format:
    |                    WebP file header (12 bytes)                |
    |                                                               |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    :                          VP8 chunk                            :
+    :                        'VP8 ' Chunk                           :
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

-VP8 chunk:
+'VP8 ' Chunk:

     0                   1                   2                   3
     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
@ -206,21 +206,21 @@ VP8 data: _Chunk Size_ bytes

 : VP8 bitstream data.

-Note the fourth character in the 'VP8 ' FourCC is an ASCII space (0x20).
+Note that the fourth character in the 'VP8 ' FourCC is an ASCII space (0x20).

-The VP8 bitstream format specification can be found at [VP8 Data Format and
-Decoding Guide][vp8spec]. Note that the VP8 frame header contains the VP8 frame
+The VP8 bitstream format specification is described in [VP8 Data Format and
+Decoding Guide][rfc 6386]. Note that the VP8 frame header contains the VP8 frame
 width and height. That is assumed to be the width and height of the canvas.

 The VP8 specification describes how to decode the image into Y'CbCr format. To
-convert to RGB, Rec. 601 SHOULD be used. Applications MAY use another
-conversion method, but visual results may differ among decoders.
+convert to RGB, [Recommendation BT.601][rec601] SHOULD be used. Applications MAY
+use another conversion method, but visual results may differ among decoders.


 Simple File Format (Lossless)
 -----------------------------

-**Note:** Older readers may not support files using the lossless format.
+**Note**: Older readers may not support files using the lossless format.

 This layout SHOULD be used if the image requires _lossless_ encoding (with an
 optional transparency channel) and does not require advanced features provided
@ -235,10 +235,10 @@ Simple WebP (lossless) file format:
    |                    WebP file header (12 bytes)                |
    |                                                               |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    :                          VP8L chunk                           :
+    :                         'VP8L' Chunk                          :
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

-VP8L chunk:
+'VP8L' Chunk:

     0                   1                   2                   3
     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
@ -262,21 +262,21 @@ and height of the canvas.
 Extended File Format
 --------------------

-**Note:** Older readers may not support files using the extended format.
+**Note**: Older readers may not support files using the extended format.

 An extended format file consists of:

-  * A 'VP8X' chunk with information about features used in the file.
+  * A 'VP8X' Chunk with information about features used in the file.

-  * An optional 'ICCP' chunk with color profile.
+  * An optional 'ICCP' Chunk with a color profile.

-  * An optional 'ANIM' chunk with animation control data.
+  * An optional 'ANIM' Chunk with animation control data.

  * Image data.

-  * An optional 'EXIF' chunk with Exif metadata.
+  * An optional 'EXIF' Chunk with Exif metadata.

-  * An optional 'XMP ' chunk with XMP metadata.
+  * An optional 'XMP ' Chunk with XMP metadata.

  * An optional list of [unknown chunks](#unknown-chunks).

@ -290,15 +290,18 @@ up of:
 For an _animated image_, the _image data_ consists of multiple frames. More
 details about frames can be found in the [Animation](#animation) section.

-All chunks SHOULD be placed in the same order as listed above. If a chunk
-appears in the wrong place, the file is invalid, but readers MAY parse the
-file, ignoring the chunks that are out of order.
+All chunks necessary for reconstruction and color correction, that is, 'VP8X',
+'ICCP', 'ANIM', 'ANMF', 'ALPH', 'VP8 ', and 'VP8L', MUST appear in the order
+described earlier. Readers SHOULD fail when chunks necessary for reconstruction
+and color correction are out of order.

-**Rationale:** Setting the order of chunks should allow quicker file
-parsing. For example, if an 'ALPH' chunk does not appear in its required
-position, a decoder can choose to stop searching for it. The rule of
-ignoring late chunks should make programs that need to do a full search
-give the same results as the ones stopping early.
+[Metadata](#metadata) and [unknown chunks](#unknown-chunks) MAY appear out of
+order.
+
+**Rationale**: The chunks necessary for reconstruction should appear first in
+the file to allow a reader to begin decoding an image before receiving all of
+the data. An application may benefit from varying the order of metadata and
+custom chunks to suit the implementation.

 Extended WebP file header:
 {:#extended_header}
@ -326,7 +329,7 @@ Reserved (Rsv): 2 bits

 ICC profile (I): 1 bit

-: Set if the file contains an ICC profile.
+: Set if the file contains an 'ICCP' Chunk.

 Alpha (L): 1 bit

@ -343,7 +346,7 @@ XMP metadata (X): 1 bit

 Animation (A): 1 bit

-: Set if this is an animated image. Data in 'ANIM' and 'ANMF' chunks should be
+: Set if this is an animated image. Data in 'ANIM' and 'ANMF' Chunks should be
  used to control the animation.

 Reserved (R): 1 bit
@ -372,9 +375,9 @@ Future specifications may add more fields. Unknown fields MUST be ignored.

 #### Animation

-An animation is controlled by ANIM and ANMF chunks.
+An animation is controlled by 'ANIM' and 'ANMF' Chunks.

-ANIM Chunk:
+'ANIM' Chunk:
 {:#anim_chunk}

 For an animated image, this chunk contains the _global parameters_ of the
@ -396,14 +399,14 @@ Background Color: 32 bits (_uint32_)
 : The default background color of the canvas in \[Blue, Green, Red, Alpha\]
  byte order. This color MAY be used to fill the unused space on the canvas
  around the frames, as well as the transparent pixels of the first frame.
-  Background color is also used when disposal method is `1`.
+  The background color is also used when the Disposal method is `1`.

-**Note**:
+**Notes**:

-  * Background color MAY contain a non-opaque alpha value, even if the _Alpha_
-    flag in [VP8X chunk](#extended_header) is unset.
+  * The background color MAY contain a non-opaque alpha value, even if the
+    _Alpha_ flag in the ['VP8X' Chunk](#extended_header) is unset.

-  * Viewer applications SHOULD treat the background color value as a hint, and
+  * Viewer applications SHOULD treat the background color value as a hint and
    are not required to use it.

  * The canvas is cleared at the start of each loop. The background color MAY be
@ -411,13 +414,14 @@ Background Color: 32 bits (_uint32_)

 Loop Count: 16 bits (_uint16_)

-: The number of times to loop the animation. `0` means infinitely.
+: The number of times to loop the animation. If it is `0`, this means
+  infinitely.

-This chunk MUST appear if the _Animation_ flag in the VP8X chunk is set.
+This chunk MUST appear if the _Animation_ flag in the 'VP8X' Chunk is set.
 If the _Animation_ flag is not set and this chunk is present, it MUST be
 ignored.

-ANMF chunk:
+'ANMF' Chunk:

 For animated images, this chunk contains information about a _single_ frame.
 If the _Animation flag_ is not set, then this chunk SHOULD NOT be present.
@ -459,10 +463,10 @@ Frame Height Minus One: 24 bits (_uint24_)

 Frame Duration: 24 bits (_uint24_)

-: The time to wait before displaying the next frame, in 1 millisecond units.
-  Note the interpretation of frame duration of 0 (and often <= 10) is
-  implementation defined. Many tools and browsers assign a minimum duration
-  similar to GIF.
+: The time to wait before displaying the next frame, in 1-millisecond units.
+  Note that the interpretation of the Frame Duration of 0 (and often <= 10) is
+  defined by the implementation. Many tools and browsers assign a minimum
+  duration similar to GIF.

 Reserved: 6 bits

@ -473,10 +477,10 @@ Blending method (B): 1 bit
 : Indicates how transparent pixels of _the current frame_ are to be blended
  with corresponding pixels of the previous canvas:

-    * `0`: Use alpha blending. After disposing of the previous frame, render the
+    * `0`: Use alpha-blending. After disposing of the previous frame, render the
      current frame on the canvas using [alpha-blending](#alpha-blending). If
-      the current frame does not have an alpha channel, assume alpha value of
-      255, effectively replacing the rectangle.
+      the current frame does not have an alpha channel, assume the alpha value
+      is 255, effectively replacing the rectangle.

    * `1`: Do not blend. After disposing of the previous frame, render the
      current frame on the canvas by overwriting the rectangle covered by the
@ -489,20 +493,20 @@ Disposal method (D): 1 bit

    * `0`: Do not dispose. Leave the canvas as is.

-    * `1`: Dispose to background color. Fill the _rectangle_ on the canvas
-      covered by the _current frame_ with background color specified in the
-      [ANIM chunk](#anim_chunk).
+    * `1`: Dispose to the background color. Fill the _rectangle_ on the canvas
+      covered by the _current frame_ with the background color specified in the
+      ['ANIM' Chunk](#anim_chunk).

 **Notes**:

  * The frame disposal only applies to the _frame rectangle_, that is, the
-    rectangle defined by _Frame X_, _Frame Y_, _frame width_ and _frame height_.
-    It may or may not cover the whole canvas.
+    rectangle defined by _Frame X_, _Frame Y_, _frame width_, and _frame
+    height_. It may or may not cover the whole canvas.

 {:#alpha-blending}
-  * **Alpha-blending**:
+  * Alpha-blending:

-    Given that each of the R, G, B and A channels is 8-bit, and the RGB
+    Given that each of the R, G, B, and A channels is 8 bits, and the RGB
    channels are _not premultiplied_ by alpha, the formula for blending
    'dst' onto 'src' is:

@ -518,10 +522,10 @@ Disposal method (D): 1 bit

  * Alpha-blending SHOULD be done in linear color space, by taking into account
    the [color profile](#color-profile) of the image. If the color profile is
-    not present, sRGB is to be assumed. (Note that sRGB also needs to be
-    linearized due to a gamma of ~2.2).
+    not present, standard RGB (sRGB) is to be assumed. (Note that sRGB also
+    needs to be linearized due to a gamma of ~2.2.)

-Frame Data: _Chunk Size_ - `16` bytes
+Frame Data: _Chunk Size_ bytes - `16`

 : Consists of:

@ -531,8 +535,8 @@ Frame Data: _Chunk Size_ - `16` bytes

  * An optional list of [unknown chunks](#unknown-chunks).

-**Note**: The 'ANMF' payload, _Frame Data_ above, consists of individual
-_padded_ chunks as described by the [RIFF file format](#riff-file-format).
+**Note**: The 'ANMF' payload, _Frame Data_, consists of individual
+_padded_ chunks, as described by the [RIFF file format](#riff-file-format).

 #### Alpha

@ -549,18 +553,20 @@ Reserved (Rsv): 2 bits

 : MUST be `0`. Readers MUST ignore this field.

-Pre-processing (P): 2 bits
+Preprocessing (P): 2 bits

-: These _informative_ bits are used to signal the pre-processing that has
+: These _informative_ bits are used to signal the preprocessing that has
   been performed during compression. The decoder can use this information to,
  for example, dither the values or smooth the gradients prior to display.

-    * `0`: No pre-processing.
+    * `0`: No preprocessing.
    * `1`: Level reduction.

+Decoders are not required to use this information in any specified way.
+
 Filtering method (F): 2 bits

-: The filtering method used:
+: The filtering methods used are described as follows:

    * `0`: None.
    * `1`: Horizontal filter.
@ -584,8 +590,8 @@ made depending on the filtering method:

 where `clip(v)` is equal to:

-  * 0    if v < 0
-  * 255  if v > 255
+  * 0    if v < 0,
+  * 255  if v > 255, or
  * v    otherwise

 The final value is derived by adding the decompressed value `X` to the
@ -594,17 +600,15 @@ into the \[0..255\] one:

 `alpha = (predictor + X) % 256`

-There are special cases for the left-most and top-most pixel positions:
+There are special cases for the left-most and top-most pixel positions. For
+example, the top-left value at location (0, 0) uses 0 as the predictor value.
+Otherwise:

-  * The top-left value at location (0, 0) uses 0 as predictor value. Otherwise,
  * For horizontal or gradient filtering methods, the left-most pixels at
    location (0, y) are predicted using the location (0, y-1) just above.
  * For vertical or gradient filtering methods, the top-most pixels at
    location (x, 0) are predicted using the location (x-1, 0) on the left.

-
-Decoders are not required to use this information in any specified way.
-
 Compression method (C): 2 bits

 : The compression method used:
@ -612,37 +616,37 @@ Compression method (C): 2 bits
    * `0`: No compression.
    * `1`: Compressed using the WebP lossless format.

-Alpha bitstream: _Chunk Size_ - `1` bytes
+Alpha bitstream: _Chunk Size_ bytes - `1`

 : Encoded alpha bitstream.

 This optional chunk contains encoded alpha data for this frame. A frame
-containing a 'VP8L' chunk SHOULD NOT contain this chunk.
+containing a 'VP8L' Chunk SHOULD NOT contain this chunk.

 **Rationale**: The transparency information is already part of the 'VP8L'
-chunk.
+Chunk.

-The alpha channel data is stored as uncompressed raw data (when
+The alpha channel data is stored as uncompressed raw data (when the
 compression method is '0') or compressed using the lossless format
 (when the compression method is '1').

-  * Raw data: consists of a byte sequence of length width * height,
+  * Raw data: This consists of a byte sequence of length = width * height,
    containing all the 8-bit transparency values in scan order.

-  * Lossless format compression: the byte sequence is a compressed
-    image-stream (as described in the [WebP Lossless Bitstream Format]
-    [webpllspec]) of implicit dimension width x height. That is, this
-    image-stream does NOT contain any headers describing the image dimension.
+  * Lossless format compression: The byte sequence is a compressed
+    image-stream (as described in ["WebP Lossless Bitstream Format"]
+    [webpllspec]) of implicit dimensions width x height. That is, this
+    image-stream does NOT contain any headers describing the image dimensions.

-    **Rationale**: the dimension is already known from other sources,
-    so storing it again would be redundant and error-prone.
+    **Rationale**: The dimensions are already known from other sources,
+    so storing them again would be redundant and prone to error.

-    Once the image-stream is decoded into ARGB color values, following
-    the process described in the lossless format specification, the
-    transparency information must be extracted from the *green* channel
-    of the ARGB quadruplet.
+    Once the image-stream is decoded into Alpha, Red, Green, Blue (ARGB) color
+    values, following the process described in the lossless format
+    specification, the transparency information must be extracted from the
+    *green* channel of the ARGB quadruplet.

-    **Rationale**: the green channel is allowed extra transformation
+    **Rationale**: The green channel is allowed extra transformation
    steps in the specification -- unlike the other channels -- that can
    improve compression.

@ -650,13 +654,13 @@ compression method is '0') or compressed using the lossless format

 This chunk contains compressed bitstream data for a single frame.

-A bitstream chunk may be either (i) a VP8 chunk, using "VP8 " (note the
-significant fourth-character space) as its tag _or_ (ii) a VP8L chunk, using
-"VP8L" as its tag.
+A bitstream chunk may be either (i) a 'VP8 ' Chunk, using 'VP8 ' (note the
+significant fourth-character space) as its FourCC, _or_ (ii) a 'VP8L' Chunk,
+using 'VP8L' as its FourCC.

-The formats of VP8 and VP8L chunks are as described in sections
+The formats of 'VP8 ' and 'VP8L' Chunks are as described in sections
 [Simple File Format (Lossy)](#simple-file-format-lossy)
-and [Simple File Format (Lossless)](#simple-file-format-lossless) respectively.
+and [Simple File Format (Lossless)](#simple-file-format-lossless), respectively.

 #### Color Profile

@ -683,14 +687,14 @@ If this chunk is not present, sRGB SHOULD be assumed.

 #### Metadata

-Metadata can be stored in 'EXIF' or 'XMP ' chunks.
+Metadata can be stored in 'EXIF' or 'XMP ' Chunks.

 There SHOULD be at most one chunk of each type ('EXIF' and 'XMP '). If there
 are more such chunks, readers MAY ignore all except the first one.

 The chunks are defined as follows:

-EXIF chunk:
+'EXIF' Chunk:

     0                   1                   2                   3
     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
@ -705,7 +709,7 @@ Exif Metadata: _Chunk Size_ bytes

 : Image metadata in Exif format.

-XMP chunk:
+'XMP ' Chunk:

     0                   1                   2                   3
     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
@ -720,72 +724,73 @@ XMP Metadata: _Chunk Size_ bytes

 : Image metadata in XMP format.

-Note the fourth character in the 'XMP ' FourCC is an ASCII space (0x20).
+Note that the fourth character in the 'XMP ' FourCC is an ASCII space (0x20).

 Additional guidance about handling metadata can be found in the
-Metadata Working Group's [Guidelines for Handling Metadata][metadata].
+Metadata Working Group's ["Guidelines for Handling Metadata"][metadata].

 #### Unknown Chunks

-A RIFF chunk (described in [this](#terminology-amp-basics) section) whose _chunk
-tag_ is different from any of the chunks described in this document, is
+A RIFF chunk (described in the [RIFF File Format](#riff-file-format) section)
+whose FourCC differs from those of the chunks described in this document is
 considered an _unknown chunk_.

 **Rationale**: Allowing unknown chunks gives a provision for future extension
-of the format, and also allows storage of any application-specific data.
+of the format and also allows storage of any application-specific data.

 A file MAY contain unknown chunks:

-  * At the end of the file as described in [Extended WebP file
-    header](#extended_header) section.
-  * At the end of ANMF chunks as described in the
+  * at the end of the file, as described in the [Extended WebP file
+    header](#extended_header) section, or
+  * at the end of 'ANMF' Chunks, as described in the
    [Animation](#animation) section.

 Readers SHOULD ignore these chunks. Writers SHOULD preserve them in their
 original order (unless they specifically intend to modify these chunks).

-### Assembling the Canvas From Frames
+### Canvas Assembly from Frames

 Here we provide an overview of how a reader MUST assemble a canvas in the case
 of an animated image.

 The process begins with creating a canvas using the dimensions given in the
-'VP8X' chunk, `Canvas Width Minus One + 1` pixels wide by `Canvas Height Minus
-One + 1` pixels high. The `Loop Count` field from the 'ANIM' chunk controls how
+'VP8X' Chunk, `Canvas Width Minus One + 1` pixels wide by `Canvas Height Minus
+One + 1` pixels high. The `Loop Count` field from the 'ANIM' Chunk controls how
 many times the animation process is repeated. This is `Loop Count - 1` for
-non-zero `Loop Count` values or infinitely if `Loop Count` is zero.
+nonzero `Loop Count` values or infinite if the `Loop Count` is zero.

-At the beginning of each loop iteration the canvas is filled using the
-background color from the 'ANIM' chunk or an application defined color.
+At the beginning of each loop iteration, the canvas is filled using the
+background color from the 'ANIM' Chunk or an application-defined color.

-'ANMF' chunks contain individual frames given in display order. Before rendering
+'ANMF' Chunks contain individual frames given in display order. Before rendering
 each frame, the previous frame's `Disposal method` is applied.

 The rendering of the decoded frame begins at the Cartesian coordinates (`2 *
-Frame X`, `2 * Frame Y`) using the top-left corner of the canvas as the origin.
+Frame X`, `2 * Frame Y`), using the top-left corner of the canvas as the origin.
 `Frame Width Minus One + 1` pixels wide by `Frame Height Minus One + 1` pixels
 high are rendered onto the canvas using the `Blending method`.

 The canvas is displayed for `Frame Duration` milliseconds. This continues until
-all frames given by 'ANMF' chunks have been displayed. A new loop iteration is
-then begun or the canvas is left in its final state if all iterations have been
+all frames given by 'ANMF' Chunks have been displayed. A new loop iteration is
+then begun, or the canvas is left in its final state if all iterations have been
 completed.

 The following pseudocode illustrates the rendering process. The notation
-_VP8X.field_ means the field in the 'VP8X' chunk with the same description.
+_VP8X.field_ means the field in the 'VP8X' Chunk with the same description.

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-assert VP8X.flags.hasAnimation
+VP8X.flags.hasAnimation MUST be TRUE
 canvas ← new image of size VP8X.canvasWidth x VP8X.canvasHeight with
-         background color ANIM.background_color.
+         background color ANIM.background_color or
+         application-defined color.
 loop_count ← ANIM.loopCount
 dispose_method ← Dispose to background color
 if loop_count == 0:
  loop_count = ∞
 frame_params ← nil
-assert next chunk in image_data is ANMF
+next chunk in image_data is ANMF MUST be TRUE
 for loop = 0..loop_count - 1
-  clear canvas to ANIM.background_color or application defined color
+  clear canvas to ANIM.background_color or application-defined color
  until eof or non-ANMF chunk
    frame_params.frameX = Frame X
    frame_params.frameY = Frame Y
@ -794,22 +799,25 @@ for loop = 0..loop_count - 1
    frame_params.frameDuration = Frame Duration
    frame_right = frame_params.frameX + frame_params.frameWidth
    frame_bottom = frame_params.frameY + frame_params.frameHeight
-    assert VP8X.canvasWidth >= frame_right
-    assert VP8X.canvasHeight >= frame_bottom
+    VP8X.canvasWidth >= frame_right MUST be TRUE
+    VP8X.canvasHeight >= frame_bottom MUST be TRUE
    for subchunk in 'Frame Data':
      if subchunk.tag == "ALPH":
-        assert alpha subchunks not found in 'Frame Data' earlier
+        alpha subchunks not found in 'Frame Data' earlier MUST be
+          TRUE
        frame_params.alpha = alpha_data
      else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
-        assert bitstream subchunks not found in 'Frame Data' earlier
+        bitstream subchunks not found in 'Frame Data' earlier MUST
+          be TRUE
        frame_params.bitstream = bitstream_data
+    apply dispose_method.
    render frame with frame_params.alpha and frame_params.bitstream
      on canvas with top-left corner at (frame_params.frameX,
-      frame_params.frameY), using blending method
+      frame_params.frameY), using Blending method
      frame_params.blendingMethod.
    canvas contains the decoded image.
    Show the contents of the canvas for
-    frame_params.frameDuration * 1ms.
+    frame_params.frameDuration * 1 ms.
    dispose_method = frame_params.disposeMethod
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -817,7 +825,7 @@ for loop = 0..loop_count - 1
 Example File Layouts
 --------------------

-A lossy encoded image with alpha may look as follows:
+A lossy-encoded image with alpha may look as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 RIFF/WEBP
@ -826,16 +834,16 @@ RIFF/WEBP
 +- VP8 (bitstream)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-A losslessly encoded image may look as follows:
+A lossless-encoded image may look as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 RIFF/WEBP
 +- VP8X (descriptions of features used)
-+- XYZW (unknown chunk)
 +- VP8L (lossless bitstream)
+- XYZW (unknown chunk)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-A lossless image with ICC profile and XMP metadata may
+A lossless image with an ICC profile and XMP metadata may
 look as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -859,10 +867,11 @@ RIFF/WEBP
 +- EXIF (metadata)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-[vp8spec]:  https://datatracker.ietf.org/doc/html/rfc6386
 [webpllspec]: https://chromium.googlesource.com/webm/libwebp/+/HEAD/doc/webp-lossless-bitstream-spec.txt
 [iccspec]: https://www.color.org/icc_specs2.xalter
 [metadata]: https://web.archive.org/web/20180919181934/http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
+[rec601]: https://www.itu.int/rec/R-REC-BT.601
 [rfc 1166]: https://datatracker.ietf.org/doc/html/rfc1166
 [rfc 2119]: https://datatracker.ietf.org/doc/html/rfc2119
+[rfc 6386]: https://datatracker.ietf.org/doc/html/rfc6386
 [rfc 8174]: https://datatracker.ietf.org/doc/html/rfc8174
--- a/doc/webp-lossless-bitstream-spec.txt
+++ b/doc/webp-lossless-bitstream-spec.txt
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@ -67,7 +67,7 @@ dwebp_LDADD += ../src/libwebp.la
 dwebp_LDADD +=$(PNG_LIBS) $(JPEG_LIBS)

 gif2webp_SOURCES = gif2webp.c gifdec.c gifdec.h
-gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(GIF_INCLUDES)
+gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(GIF_INCLUDES) -I$(top_srcdir)
 gif2webp_LDADD  =
 gif2webp_LDADD += libexample_util.la
 gif2webp_LDADD += ../imageio/libimageio_util.la
--- a/examples/anim_diff.c
+++ b/examples/anim_diff.c
@ -16,7 +16,7 @@
 #include <assert.h>
 #include <limits.h>
 #include <stdio.h>
-#include <stdlib.h>  // for 'strtod'.
+#include <stdlib.h>
 #include <string.h>  // for 'strcmp'.

 #include "./anim_util.h"
@ -206,8 +206,9 @@ static void Help(void) {
  printf("  -version ............ print version number and exit\n");
 }

+// Returns 0 on success, 1 if animation files differ, and 2 for any error.
 int main(int argc, const char* argv[]) {
-  int return_code = -1;
+  int return_code = 2;
  int dump_frames = 0;
  const char* dump_folder = NULL;
  double min_psnr = 0.;
@ -269,18 +270,18 @@ int main(int argc, const char* argv[]) {
    }
    if (parse_error) {
      Help();
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(return_code);
    }
  }
  if (argc < 3) {
    Help();
-    FREE_WARGV_AND_RETURN(-1);
+    FREE_WARGV_AND_RETURN(return_code);
  }


  if (!got_input2) {
    Help();
-    FREE_WARGV_AND_RETURN(-1);
+    FREE_WARGV_AND_RETURN(return_code);
  }

  if (dump_frames) {
@ -293,7 +294,7 @@ int main(int argc, const char* argv[]) {
    if (!ReadAnimatedImage(files[i], &images[i], dump_frames, dump_folder)) {
      WFPRINTF(stderr, "Error decoding file: %s\n Aborting.\n",
               (const W_CHAR*)files[i]);
-      return_code = -2;
+      return_code = 2;
      goto End;
    } else {
      MinimizeAnimationFrames(&images[i], max_diff);
@ -304,7 +305,7 @@ int main(int argc, const char* argv[]) {
                                premultiply, min_psnr)) {
    WFPRINTF(stderr, "\nFiles %s and %s differ.\n", (const W_CHAR*)files[0],
             (const W_CHAR*)files[1]);
-    return_code = -3;
+    return_code = 1;
  } else {
    WPRINTF("\nFiles %s and %s are identical.\n", (const W_CHAR*)files[0],
            (const W_CHAR*)files[1]);
--- a/examples/anim_dump.c
+++ b/examples/anim_dump.c
@ -12,6 +12,7 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>  // for 'strcmp'.

 #include "./anim_util.h"
@ -35,6 +36,7 @@ static void Help(void) {
  printf("  -version ............ print version number and exit\n");
 }

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, const char* argv[]) {
  int error = 0;
  const W_CHAR* dump_folder = TO_W_CHAR(".");
@ -47,7 +49,7 @@ int main(int argc, const char* argv[]) {

  if (argc < 2) {
    Help();
-    FREE_WARGV_AND_RETURN(-1);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  for (c = 1; !error && c < argc; ++c) {
@ -73,7 +75,7 @@ int main(int argc, const char* argv[]) {
      suffix = TO_W_CHAR("pam");
    } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-version")) {
      int dec_version, demux_version;
      GetAnimatedImageVersions(&dec_version, &demux_version);
@ -82,7 +84,7 @@ int main(int argc, const char* argv[]) {
             (dec_version >> 0) & 0xff,
             (demux_version >> 16) & 0xff, (demux_version >> 8) & 0xff,
             (demux_version >> 0) & 0xff);
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else {
      uint32_t i;
      AnimatedImage image;
@ -98,7 +100,11 @@ int main(int argc, const char* argv[]) {
      for (i = 0; !error && i < image.num_frames; ++i) {
        W_CHAR out_file[1024];
        WebPDecBuffer buffer;
-        WebPInitDecBuffer(&buffer);
+        if (!WebPInitDecBuffer(&buffer)) {
+          fprintf(stderr, "Cannot init dec buffer\n");
+          error = 1;
+          continue;
+        }
        buffer.colorspace = MODE_RGBA;
        buffer.is_external_memory = 1;
        buffer.width = image.canvas_width;
@ -117,5 +123,5 @@ int main(int argc, const char* argv[]) {
      ClearAnimatedImage(&image);
    }
  }
-  FREE_WARGV_AND_RETURN(error ? 1 : 0);
+  FREE_WARGV_AND_RETURN(error ? EXIT_FAILURE : EXIT_SUCCESS);
 }
--- a/examples/anim_util.c
+++ b/examples/anim_util.c
@ -771,6 +771,7 @@ void GetDiffAndPSNR(const uint8_t rgba1[], const uint8_t rgba2[],
    *psnr = 99.;  // PSNR when images are identical.
  } else {
    sse /= stride * height;
+    assert(sse != 0.0);
    *psnr = 4.3429448 * log(255. * 255. / sse);
  }
 }
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@ -178,8 +178,14 @@ static void PrintFullLosslessInfo(const WebPAuxStats* const stats,
    if (stats->lossless_features & 8) fprintf(stderr, " PALETTE");
    fprintf(stderr, "\n");
  }
-  fprintf(stderr, "  * Precision Bits: histogram=%d transform=%d cache=%d\n",
-          stats->histogram_bits, stats->transform_bits, stats->cache_bits);
+  fprintf(stderr, "  * Precision Bits: histogram=%d", stats->histogram_bits);
+  if (stats->lossless_features & 1) {
+    fprintf(stderr, " prediction=%d", stats->transform_bits);
+  }
+  if (stats->lossless_features & 2) {
+    fprintf(stderr, " cross-color=%d", stats->cross_color_transform_bits);
+  }
+  fprintf(stderr, " cache=%d\n", stats->cache_bits);
  if (stats->palette_size > 0) {
    fprintf(stderr, "  * Palette size:   %d\n", stats->palette_size);
  }
@ -306,6 +312,7 @@ static int MyWriter(const uint8_t* data, size_t data_size,
 // Dumps a picture as a PGM file using the IMC4 layout.
 static int DumpPicture(const WebPPicture* const picture, const char* PGM_name) {
  int y;
+  int ok = 0;
  const int uv_width = (picture->width + 1) / 2;
  const int uv_height = (picture->height + 1) / 2;
  const int stride = (picture->width + 1) & ~1;
@ -320,23 +327,26 @@ static int DumpPicture(const WebPPicture* const picture, const char* PGM_name) {
  if (f == NULL) return 0;
  fprintf(f, "P5\n%d %d\n255\n", stride, height);
  for (y = 0; y < picture->height; ++y) {
-    if (fwrite(src_y, picture->width, 1, f) != 1) return 0;
+    if (fwrite(src_y, picture->width, 1, f) != 1) goto Error;
    if (picture->width & 1) fputc(0, f);  // pad
    src_y += picture->y_stride;
  }
  for (y = 0; y < uv_height; ++y) {
-    if (fwrite(src_u, uv_width, 1, f) != 1) return 0;
-    if (fwrite(src_v, uv_width, 1, f) != 1) return 0;
+    if (fwrite(src_u, uv_width, 1, f) != 1) goto Error;
+    if (fwrite(src_v, uv_width, 1, f) != 1) goto Error;
    src_u += picture->uv_stride;
    src_v += picture->uv_stride;
  }
  for (y = 0; y < alpha_height; ++y) {
-    if (fwrite(src_a, picture->width, 1, f) != 1) return 0;
+    if (fwrite(src_a, picture->width, 1, f) != 1) goto Error;
    if (picture->width & 1) fputc(0, f);  // pad
    src_a += picture->a_stride;
  }
+  ok = 1;
+
+ Error:
  fclose(f);
-  return 1;
+  return ok;
 }

 // -----------------------------------------------------------------------------
@ -647,8 +657,9 @@ static const char* const kErrorMessages[VP8_ENC_ERROR_LAST] = {

 //------------------------------------------------------------------------------

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, const char* argv[]) {
-  int return_value = -1;
+  int return_value = EXIT_FAILURE;
  const char* in_file = NULL, *out_file = NULL, *dump_file = NULL;
  FILE* out = NULL;
  int c;
@ -682,22 +693,22 @@ int main(int argc, const char* argv[]) {
      !WebPPictureInit(&original_picture) ||
      !WebPConfigInit(&config)) {
    fprintf(stderr, "Error! Version mismatch!\n");
-    FREE_WARGV_AND_RETURN(-1);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  if (argc == 1) {
    HelpShort();
-    FREE_WARGV_AND_RETURN(0);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  for (c = 1; c < argc; ++c) {
    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      HelpShort();
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-H") || !strcmp(argv[c], "-longhelp")) {
      HelpLong();
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-o") && c + 1 < argc) {
      out_file = (const char*)GET_WARGV(argv, ++c);
    } else if (!strcmp(argv[c], "-d") && c + 1 < argc) {
@ -838,7 +849,7 @@ int main(int argc, const char* argv[]) {
      printf("libsharpyuv: %d.%d.%d\n",
             (sharpyuv_version >> 24) & 0xff, (sharpyuv_version >> 16) & 0xffff,
             sharpyuv_version & 0xff);
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-progress")) {
      show_progress = 1;
    } else if (!strcmp(argv[c], "-quiet")) {
@ -900,7 +911,7 @@ int main(int argc, const char* argv[]) {
        if (i == kNumTokens) {
          fprintf(stderr, "Error! Unknown metadata type '%.*s'\n",
                  (int)(token - start), start);
-          FREE_WARGV_AND_RETURN(-1);
+          FREE_WARGV_AND_RETURN(EXIT_FAILURE);
        }
        start = token + 1;
      }
@ -919,14 +930,14 @@ int main(int argc, const char* argv[]) {
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Error! Unknown option '%s'\n", argv[c]);
      HelpLong();
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(EXIT_FAILURE);
    } else {
      in_file = (const char*)GET_WARGV(argv, c);
    }

    if (parse_error) {
      HelpLong();
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(EXIT_FAILURE);
    }
  }
  if (in_file == NULL) {
@ -1227,7 +1238,7 @@ int main(int argc, const char* argv[]) {
      PrintMetadataInfo(&metadata, metadata_written);
    }
  }
-  return_value = 0;
+  return_value = EXIT_SUCCESS;

 Error:
  WebPMemoryWriterClear(&memory_writer);
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -177,6 +177,7 @@ static uint8_t* AllocateExternalBuffer(WebPDecoderConfig* config,
  return external_buffer;
 }

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, const char* argv[]) {
  int ok = 0;
  const char* in_file = NULL;
@ -197,14 +198,14 @@ int main(int argc, const char* argv[]) {

  if (!WebPInitDecoderConfig(&config)) {
    fprintf(stderr, "Library version mismatch!\n");
-    FREE_WARGV_AND_RETURN(-1);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  for (c = 1; c < argc; ++c) {
    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-o") && c < argc - 1) {
      out_file = (const char*)GET_WARGV(argv, ++c);
    } else if (!strcmp(argv[c], "-alpha")) {
@ -227,7 +228,7 @@ int main(int argc, const char* argv[]) {
      const int version = WebPGetDecoderVersion();
      printf("%d.%d.%d\n",
             (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-pgm")) {
      format = PGM;
    } else if (!strcmp(argv[c], "-yuv")) {
@ -293,21 +294,21 @@ int main(int argc, const char* argv[]) {
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Unknown option '%s'\n", argv[c]);
      Help();
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(EXIT_FAILURE);
    } else {
      in_file = (const char*)GET_WARGV(argv, c);
    }

    if (parse_error) {
      Help();
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(EXIT_FAILURE);
    }
  }

  if (in_file == NULL) {
    fprintf(stderr, "missing input file!!\n");
    Help();
-    FREE_WARGV_AND_RETURN(-1);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  if (quiet) verbose = 0;
@ -316,7 +317,7 @@ int main(int argc, const char* argv[]) {
    VP8StatusCode status = VP8_STATUS_OK;
    size_t data_size = 0;
    if (!LoadWebP(in_file, &data, &data_size, bitstream)) {
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(EXIT_FAILURE);
    }

    switch (format) {
@ -415,7 +416,7 @@ int main(int argc, const char* argv[]) {
  WebPFreeDecBuffer(output_buffer);
  WebPFree((void*)external_buffer);
  WebPFree((void*)data);
-  FREE_WARGV_AND_RETURN(ok ? 0 : -1);
+  FREE_WARGV_AND_RETURN(ok ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 //------------------------------------------------------------------------------
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -28,6 +28,7 @@
 #endif

 #include <gif_lib.h>
+#include "sharpyuv/sharpyuv.h"
 #include "webp/encode.h"
 #include "webp/mux.h"
 #include "../examples/example_util.h"
@ -70,8 +71,14 @@ static void Help(void) {
  printf("  -lossy ................. encode image using lossy compression\n");
  printf("  -mixed ................. for each frame in the image, pick lossy\n"
         "                           or lossless compression heuristically\n");
+  printf("  -near_lossless <int> ... use near-lossless image preprocessing\n"
+         "                           (0..100=off), default=100\n");
+  printf("  -sharp_yuv ............. use sharper (and slower) RGB->YUV "
+                                    "conversion\n"
+         "                           (lossy only)\n");
  printf("  -q <float> ............. quality factor (0:small..100:big)\n");
-  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
+  printf("  -m <int> ............... compression method (0=fast, 6=slowest), "
+         "default=4\n");
  printf("  -min_size .............. minimize output size (default:off)\n"
         "                           lossless compression by default; can be\n"
         "                           combined with -q, -m, -lossy or -mixed\n"
@ -96,6 +103,7 @@ static void Help(void) {

 //------------------------------------------------------------------------------

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, const char* argv[]) {
  int verbose = 0;
  int gif_error = GIF_ERROR;
@ -140,7 +148,7 @@ int main(int argc, const char* argv[]) {
      !WebPPictureInit(&frame) || !WebPPictureInit(&curr_canvas) ||
      !WebPPictureInit(&prev_canvas)) {
    fprintf(stderr, "Error! Version mismatch!\n");
-    FREE_WARGV_AND_RETURN(-1);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }
  config.lossless = 1;  // Use lossless compression by default.

@ -150,14 +158,14 @@ int main(int argc, const char* argv[]) {

  if (argc == 1) {
    Help();
-    FREE_WARGV_AND_RETURN(0);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  for (c = 1; c < argc; ++c) {
    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-o") && c < argc - 1) {
      out_file = GET_WARGV(argv, ++c);
    } else if (!strcmp(argv[c], "-lossy")) {
@ -165,6 +173,10 @@ int main(int argc, const char* argv[]) {
    } else if (!strcmp(argv[c], "-mixed")) {
      enc_options.allow_mixed = 1;
      config.lossless = 0;
+    } else if (!strcmp(argv[c], "-near_lossless") && c < argc - 1) {
+      config.near_lossless = ExUtilGetInt(argv[++c], 0, &parse_error);
+    } else if (!strcmp(argv[c], "-sharp_yuv")) {
+      config.use_sharp_yuv = 1;
    } else if (!strcmp(argv[c], "-loop_compatibility")) {
      loop_compatibility = 1;
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
@ -216,7 +228,7 @@ int main(int argc, const char* argv[]) {
          fprintf(stderr, "Error! Unknown metadata type '%.*s'\n",
                  (int)(token - start), start);
          Help();
-          FREE_WARGV_AND_RETURN(-1);
+          FREE_WARGV_AND_RETURN(EXIT_FAILURE);
        }
        start = token + 1;
      }
@ -225,11 +237,14 @@ int main(int argc, const char* argv[]) {
    } else if (!strcmp(argv[c], "-version")) {
      const int enc_version = WebPGetEncoderVersion();
      const int mux_version = WebPGetMuxVersion();
+      const int sharpyuv_version = SharpYuvGetVersion();
      printf("WebP Encoder version: %d.%d.%d\nWebP Mux version: %d.%d.%d\n",
             (enc_version >> 16) & 0xff, (enc_version >> 8) & 0xff,
             enc_version & 0xff, (mux_version >> 16) & 0xff,
             (mux_version >> 8) & 0xff, mux_version & 0xff);
-      FREE_WARGV_AND_RETURN(0);
+      printf("libsharpyuv: %d.%d.%d\n", (sharpyuv_version >> 24) & 0xff,
+             (sharpyuv_version >> 16) & 0xffff, sharpyuv_version & 0xff);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-quiet")) {
      quiet = 1;
      enc_options.verbose = 0;
@ -242,14 +257,14 @@ int main(int argc, const char* argv[]) {
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Error! Unknown option '%s'\n", argv[c]);
      Help();
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(EXIT_FAILURE);
    } else {
      in_file = GET_WARGV(argv, c);
    }

    if (parse_error) {
      Help();
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(EXIT_FAILURE);
    }
  }

@ -593,7 +608,7 @@ int main(int argc, const char* argv[]) {
 #endif
  }

-  FREE_WARGV_AND_RETURN(!ok);
+  FREE_WARGV_AND_RETURN(ok ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 #else  // !WEBP_HAVE_GIF
@ -601,7 +616,7 @@ int main(int argc, const char* argv[]) {
 int main(int argc, const char* argv[]) {
  fprintf(stderr, "GIF support not enabled in %s.\n", argv[0]);
  (void)argc;
-  return 0;
+  return EXIT_FAILURE;
 }

 #endif
--- a/examples/gifdec.c
+++ b/examples/gifdec.c
@ -317,7 +317,7 @@ void GIFDisplayError(const GifFileType* const gif, int gif_error) {

 #else  // !WEBP_HAVE_GIF

-static void ErrorGIFNotAvailable() {
+static void ErrorGIFNotAvailable(void) {
  fprintf(stderr, "GIF support not compiled. Please install the libgif-dev "
          "package before building.\n");
 }
--- a/examples/img2webp.c
+++ b/examples/img2webp.c
@ -59,10 +59,15 @@ static void Help(void) {

  printf("Per-frame options (only used for subsequent images input):\n");
  printf(" -d <int> ............. frame duration in ms (default: 100)\n");
-  printf(" -lossless  ........... use lossless mode (default)\n");
-  printf(" -lossy ... ........... use lossy mode\n");
+  printf(" -lossless ............ use lossless mode (default)\n");
+  printf(" -lossy ............... use lossy mode\n");
  printf(" -q <float> ........... quality\n");
-  printf(" -m <int> ............. method to use\n");
+  printf(" -m <int> ............. compression method (0=fast, 6=slowest), "
+         "default=4\n");
+  printf(" -exact, -noexact ..... preserve or alter RGB values in transparent "
+                                  "area\n"
+         "                        (default: -noexact, may cause artifacts\n"
+         "                                  with lossy animations)\n");

  printf("\n");
  printf("example: img2webp -loop 2 in0.png -lossy in1.jpg\n"
@ -130,6 +135,7 @@ static int SetLoopCount(int loop_count, WebPData* const webp_data) {

 //------------------------------------------------------------------------------

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, const char* argv[]) {
  const char* output = NULL;
  WebPAnimEncoder* enc = NULL;
@ -145,13 +151,14 @@ int main(int argc, const char* argv[]) {
  WebPData webp_data;
  int c;
  int have_input = 0;
+  int last_input_index = 0;
  CommandLineArguments cmd_args;
  int ok;

  INIT_WARGV(argc, argv);

  ok = ExUtilInitCommandLineArguments(argc - 1, argv + 1, &cmd_args);
-  if (!ok) FREE_WARGV_AND_RETURN(1);
+  if (!ok) FREE_WARGV_AND_RETURN(EXIT_FAILURE);

  argc = cmd_args.argc_;
  argv = cmd_args.argv_;
@ -199,7 +206,7 @@ int main(int argc, const char* argv[]) {
        verbose = 1;
      } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
        Help();
-        FREE_WARGV_AND_RETURN(0);
+        FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
      } else if (!strcmp(argv[c], "-version")) {
        const int enc_version = WebPGetEncoderVersion();
        const int mux_version = WebPGetMuxVersion();
@ -223,6 +230,8 @@ int main(int argc, const char* argv[]) {
  }
  if (!have_input) {
    fprintf(stderr, "No input file(s) for generating animation!\n");
+    ok = 0;
+    Help();
    goto End;
  }

@ -247,6 +256,10 @@ int main(int argc, const char* argv[]) {
          fprintf(stderr, "Invalid negative duration (%d)\n", duration);
          parse_error = 1;
        }
+      } else if (!strcmp(argv[c], "-exact")) {
+        config.exact = 1;
+      } else if (!strcmp(argv[c], "-noexact")) {
+        config.exact = 0;
      } else {
        parse_error = 1;   // shouldn't be here.
        fprintf(stderr, "Unknown option [%s]\n", argv[c]);
@ -267,6 +280,7 @@ int main(int argc, const char* argv[]) {
    // read next input image
    pic.use_argb = 1;
    ok = ReadImage((const char*)GET_WARGV_SHIFTED(argv, c), &pic);
+    last_input_index = c;
    if (!ok) goto End;

    if (enc == NULL) {
@ -305,6 +319,13 @@ int main(int argc, const char* argv[]) {
    ++pic_num;
  }

+  for (c = last_input_index + 1; c < argc; ++c) {
+    if (argv[c] != NULL) {
+      fprintf(stderr, "Warning: unused option [%s]!"
+                      " Frame options go before the input frame.\n", argv[c]);
+    }
+  }
+
  // add a last fake frame to signal the last duration
  ok = ok && WebPAnimEncoderAdd(enc, NULL, timestamp_ms, NULL);
  ok = ok && WebPAnimEncoderAssemble(enc, &webp_data);
@ -335,5 +356,5 @@ int main(int argc, const char* argv[]) {
  }
  WebPDataClear(&webp_data);
  ExUtilDeleteCommandLineArguments(&cmd_args);
-  FREE_WARGV_AND_RETURN(ok ? 0 : 1);
+  FREE_WARGV_AND_RETURN(ok ? EXIT_SUCCESS : EXIT_FAILURE);
 }
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -18,6 +18,7 @@
 #define _POSIX_C_SOURCE 200112L  // for setenv
 #endif

+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@ -430,10 +431,13 @@ static void HandleDisplay(void) {
 #endif
 }

-static void StartDisplay(void) {
+static void StartDisplay(const char* filename) {
  int width = kParams.canvas_width;
  int height = kParams.canvas_height;
  int screen_width, screen_height;
+  const char viewername[] = " - WebP viewer";
+  // max linux file len + viewername string
+  char title[4096 + sizeof(viewername)] = "";
  // TODO(webp:365) GLUT_DOUBLE results in flickering / old frames to be
  // partially displayed with animated webp + alpha.
 #if defined(__APPLE__) || defined(_WIN32)
@ -453,8 +457,9 @@ static void StartDisplay(void) {
      height = screen_height;
    }
  }
+  snprintf(title, sizeof(title), "%s%s", filename, viewername);
  glutInitWindowSize(width, height);
-  glutCreateWindow("WebP viewer");
+  glutCreateWindow(title);
  glutDisplayFunc(HandleDisplay);
  glutReshapeFunc(HandleReshape);
  glutIdleFunc(NULL);
@ -493,7 +498,7 @@ static void Help(void) {
 }

 int main(int argc, char* argv[]) {
-  int c;
+  int c, file_name_argv_index = 1;
  WebPDecoderConfig* const config = &kParams.config;
  WebPIterator* const curr = &kParams.curr_frame;

@ -501,7 +506,7 @@ int main(int argc, char* argv[]) {

  if (!WebPInitDecoderConfig(config)) {
    fprintf(stderr, "Library version mismatch!\n");
-    FREE_WARGV_AND_RETURN(-1);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }
  config->options.dithering_strength = 50;
  config->options.alpha_dithering_strength = 100;
@ -513,7 +518,7 @@ int main(int argc, char* argv[]) {
    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-noicc")) {
      kParams.use_color_profile = 0;
    } else if (!strcmp(argv[c], "-nofancy")) {
@ -536,30 +541,34 @@ int main(int argc, char* argv[]) {
             (dec_version >> 16) & 0xff, (dec_version >> 8) & 0xff,
             dec_version & 0xff, (dmux_version >> 16) & 0xff,
             (dmux_version >> 8) & 0xff, dmux_version & 0xff);
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-mt")) {
      config->options.use_threads = 1;
    } else if (!strcmp(argv[c], "--")) {
-      if (c < argc - 1) kParams.file_name = (const char*)GET_WARGV(argv, ++c);
+      if (c < argc - 1) {
+        kParams.file_name = (const char*)GET_WARGV(argv, ++c);
+        file_name_argv_index = c;
+      }
      break;
    } else if (argv[c][0] == '-') {
      printf("Unknown option '%s'\n", argv[c]);
      Help();
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(EXIT_FAILURE);
    } else {
      kParams.file_name = (const char*)GET_WARGV(argv, c);
+      file_name_argv_index = c;
    }

    if (parse_error) {
      Help();
-      FREE_WARGV_AND_RETURN(-1);
+      FREE_WARGV_AND_RETURN(EXIT_FAILURE);
    }
  }

  if (kParams.file_name == NULL) {
    printf("missing input file!!\n");
    Help();
-    FREE_WARGV_AND_RETURN(0);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  if (!ImgIoUtilReadFile(kParams.file_name,
@ -613,7 +622,7 @@ int main(int argc, char* argv[]) {

  // Position iterator to last frame. Next call to HandleDisplay will wrap over.
  // We take this into account by bumping up loop_count.
-  WebPDemuxGetFrame(kParams.dmux, 0, curr);
+  if (!WebPDemuxGetFrame(kParams.dmux, 0, curr)) goto Error;
  if (kParams.loop_count) ++kParams.loop_count;

 #if defined(__unix__) || defined(__CYGWIN__)
@ -627,18 +636,18 @@ int main(int argc, char* argv[]) {
 #ifdef FREEGLUT
  glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_CONTINUE_EXECUTION);
 #endif
-  StartDisplay();
+  StartDisplay(argv[file_name_argv_index]);

  if (kParams.has_animation) glutTimerFunc(0, decode_callback, 0);
  glutMainLoop();

  // Should only be reached when using FREEGLUT:
  ClearParams();
-  FREE_WARGV_AND_RETURN(0);
+  FREE_WARGV_AND_RETURN(EXIT_SUCCESS);

 Error:
  ClearParams();
-  FREE_WARGV_AND_RETURN(-1);
+  FREE_WARGV_AND_RETURN(EXIT_FAILURE);
 }

 #else   // !WEBP_HAVE_GL
@ -646,7 +655,7 @@ int main(int argc, char* argv[]) {
 int main(int argc, const char* argv[]) {
  fprintf(stderr, "OpenGL support not enabled in %s.\n", argv[0]);
  (void)argc;
-  return 0;
+  return EXIT_FAILURE;
 }

 #endif
--- a/examples/webpinfo.c
+++ b/examples/webpinfo.c
@ -14,6 +14,7 @@

 #include <assert.h>
 #include <stdio.h>
+#include <stdlib.h>

 #ifdef HAVE_CONFIG_H
 #include "webp/config.h"
@ -357,12 +358,12 @@ static WebPInfoStatus ParseLossyHeader(const ChunkData* const chunk_data,
  }
  data += 3;
  data_size -= 3;
-  printf("  Key frame:        %s\n"
-         "  Profile:          %d\n"
-         "  Display:          %s\n"
-         "  Part. 0 length:   %d\n",
-         key_frame ? "Yes" : "No", profile,
-         display ? "Yes" : "No", partition0_length);
+  printf(
+      "  Key frame:        %s\n"
+      "  Profile:          %d\n"
+      "  Display:          Yes\n"
+      "  Part. 0 length:   %d\n",
+      key_frame ? "Yes" : "No", profile, partition0_length);
  if (key_frame) {
    if (!(data[0] == 0x9d && data[1] == 0x01 && data[2] == 0x2a)) {
      LOG_ERROR("Invalid lossy bitstream signature.");
@ -1120,6 +1121,7 @@ static void Help(void) {
         "  -bitstream_info .... Parse bitstream header.\n");
 }

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, const char* argv[]) {
  int c, quiet = 0, show_diag = 0, show_summary = 0;
  int parse_bitstream = 0;
@ -1130,7 +1132,7 @@ int main(int argc, const char* argv[]) {

  if (argc == 1) {
    Help();
-    FREE_WARGV_AND_RETURN(WEBP_INFO_OK);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  // Parse command-line input.
@ -1138,7 +1140,7 @@ int main(int argc, const char* argv[]) {
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help") ||
        !strcmp(argv[c], "-H") || !strcmp(argv[c], "-longhelp")) {
      Help();
-      FREE_WARGV_AND_RETURN(WEBP_INFO_OK);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else if (!strcmp(argv[c], "-quiet")) {
      quiet = 1;
    } else if (!strcmp(argv[c], "-diag")) {
@ -1151,7 +1153,7 @@ int main(int argc, const char* argv[]) {
      const int version = WebPGetDecoderVersion();
      printf("WebP Decoder version: %d.%d.%d\n",
             (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else {  // Assume the remaining are all input files.
      break;
    }
@ -1159,7 +1161,7 @@ int main(int argc, const char* argv[]) {

  if (c == argc) {
    Help();
-    FREE_WARGV_AND_RETURN(WEBP_INFO_INVALID_COMMAND);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  // Process input files one by one.
@ -1182,5 +1184,6 @@ int main(int argc, const char* argv[]) {
    webp_info_status = AnalyzeWebP(&webp_info, &webp_data);
    WebPDataClear(&webp_data);
  }
-  FREE_WARGV_AND_RETURN(webp_info_status);
+  FREE_WARGV_AND_RETURN((webp_info_status == WEBP_INFO_OK) ? EXIT_SUCCESS
+                                                           : EXIT_FAILURE);
 }
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
@ -59,6 +59,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
 #include "webp/decode.h"
 #include "webp/mux.h"
 #include "../examples/example_util.h"
@ -150,16 +151,20 @@ static const char* ErrorString(WebPMuxError err) {
 }

 #define RETURN_IF_ERROR(ERR_MSG)                                     \
-  if (err != WEBP_MUX_OK) {                                          \
-    fprintf(stderr, ERR_MSG);                                        \
-    return err;                                                      \
-  }
+  do {                                                               \
+    if (err != WEBP_MUX_OK) {                                        \
+      fprintf(stderr, ERR_MSG);                                      \
+      return err;                                                    \
+    }                                                                \
+  } while (0)

 #define RETURN_IF_ERROR3(ERR_MSG, FORMAT_STR1, FORMAT_STR2)          \
-  if (err != WEBP_MUX_OK) {                                          \
-    fprintf(stderr, ERR_MSG, FORMAT_STR1, FORMAT_STR2);              \
-    return err;                                                      \
-  }
+  do {                                                               \
+    if (err != WEBP_MUX_OK) {                                        \
+      fprintf(stderr, ERR_MSG, FORMAT_STR1, FORMAT_STR2);            \
+      return err;                                                    \
+    }                                                                \
+  } while (0)

 #define ERROR_GOTO1(ERR_MSG, LABEL)                                  \
  do {                                                               \
@ -605,20 +610,26 @@ static int ValidateCommandLine(const CommandLineArguments* const cmd_args,
 #define FEATURETYPE_IS_NIL (config->type_ == NIL_FEATURE)

 #define CHECK_NUM_ARGS_AT_LEAST(NUM, LABEL)                              \
-  if (argc < i + (NUM)) {                                                \
-    fprintf(stderr, "ERROR: Too few arguments for '%s'.\n", argv[i]);    \
-    goto LABEL;                                                          \
-  }
+  do {                                                                   \
+    if (argc < i + (NUM)) {                                              \
+      fprintf(stderr, "ERROR: Too few arguments for '%s'.\n", argv[i]);  \
+      goto LABEL;                                                        \
+    }                                                                    \
+  } while (0)

 #define CHECK_NUM_ARGS_AT_MOST(NUM, LABEL)                               \
-  if (argc > i + (NUM)) {                                                \
-    fprintf(stderr, "ERROR: Too many arguments for '%s'.\n", argv[i]);   \
-    goto LABEL;                                                          \
-  }
+  do {                                                                   \
+    if (argc > i + (NUM)) {                                              \
+      fprintf(stderr, "ERROR: Too many arguments for '%s'.\n", argv[i]); \
+      goto LABEL;                                                        \
+    }                                                                    \
+  } while (0)

 #define CHECK_NUM_ARGS_EXACTLY(NUM, LABEL)                               \
-  CHECK_NUM_ARGS_AT_LEAST(NUM, LABEL);                                   \
-  CHECK_NUM_ARGS_AT_MOST(NUM, LABEL);
+  do {                                                                   \
+    CHECK_NUM_ARGS_AT_LEAST(NUM, LABEL);                                 \
+    CHECK_NUM_ARGS_AT_MOST(NUM, LABEL);                                  \
+  } while (0)

 // Parses command-line arguments to fill up config object. Also performs some
 // semantic checks. unicode_argv contains wchar_t arguments or is null.
@ -1215,6 +1226,7 @@ static int Process(const Config* config) {
 //------------------------------------------------------------------------------
 // Main.

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, const char* argv[]) {
  Config config;
  int ok;
@ -1228,7 +1240,7 @@ int main(int argc, const char* argv[]) {
    PrintHelp();
  }
  DeleteConfig(&config);
-  FREE_WARGV_AND_RETURN(!ok);
+  FREE_WARGV_AND_RETURN(ok ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 //------------------------------------------------------------------------------
--- a/extras/Makefile.am
+++ b/extras/Makefile.am
@ -7,6 +7,7 @@ noinst_HEADERS += ../src/webp/types.h

 libwebpextras_la_SOURCES =
 libwebpextras_la_SOURCES += extras.c extras.h quality_estimate.c
+libwebpextras_la_SOURCES += sharpyuv_risk_table.c sharpyuv_risk_table.h

 libwebpextras_la_CPPFLAGS = $(AM_CPPFLAGS)
 libwebpextras_la_LDFLAGS = -lm
--- a/extras/extras.c
+++ b/extras/extras.c
@ -11,15 +11,21 @@
 //

 #include "extras/extras.h"
-#include "webp/format_constants.h"
-#include "src/dsp/dsp.h"

 #include <assert.h>
+#include <limits.h>
 #include <string.h>

+#include "extras/sharpyuv_risk_table.h"
+#include "sharpyuv/sharpyuv.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/utils.h"
+#include "webp/format_constants.h"
+#include "webp/types.h"
+
 #define XTRA_MAJ_VERSION 1
-#define XTRA_MIN_VERSION 3
-#define XTRA_REV_VERSION 1
+#define XTRA_MIN_VERSION 5
+#define XTRA_REV_VERSION 0

 //------------------------------------------------------------------------------

@ -160,3 +166,159 @@ int WebPUnmultiplyARGB(WebPPicture* pic) {
 }

 //------------------------------------------------------------------------------
+// 420 risk metric
+
+#define YUV_FIX 16  // fixed-point precision for RGB->YUV
+static const int kYuvHalf = 1 << (YUV_FIX - 1);
+
+// Converts a fixed-point value in [0, (256 << YUV_FIX) - 1] to an index in
+// [0, rgb_sampling_size - 1]. The value is rounded to an integer, clamped to
+// [0, 255] and then rescaled, so that both extremes are preserved:
+//  0 -> 0
+//  (256 << YUV_FIX) - 1 -> rgb_sampling_size - 1
+static int SharpYuvConvertValueToSampledIdx(int v, int rgb_sampling_size) {
+  const int max_idx = rgb_sampling_size - 1;
+  int level = (v + kYuvHalf) >> YUV_FIX;  // round to the nearest integer
+  if (level < 0) {
+    level = 0;
+  } else if (level > 255) {
+    level = 255;
+  }
+  return (level * max_idx) / 255;
+}
+
+#undef YUV_FIX
+
+// Returns the position of the color (r, g, b) in a precomputed risk score
+// table whose YUV space is subsampled to precomputed_scores_table_sampling
+// values per channel (see sharpyuv_risk_table.h). The color is converted with
+// 'matrix' and the sampled components are combined as
+//   y + u * sampling + v * sampling * sampling.
+static int SharpYuvConvertToYuvSharpnessIndex(
+    int r, int g, int b, const SharpYuvConversionMatrix* matrix,
+    int precomputed_scores_table_sampling) {
+  const int sampling = precomputed_scores_table_sampling;
+  // Fixed-point Y, U and V values of the color.
+  const int y_fixed = matrix->rgb_to_y[0] * r + matrix->rgb_to_y[1] * g +
+                      matrix->rgb_to_y[2] * b + matrix->rgb_to_y[3];
+  const int u_fixed = matrix->rgb_to_u[0] * r + matrix->rgb_to_u[1] * g +
+                      matrix->rgb_to_u[2] * b + matrix->rgb_to_u[3];
+  const int v_fixed = matrix->rgb_to_v[0] * r + matrix->rgb_to_v[1] * g +
+                      matrix->rgb_to_v[2] * b + matrix->rgb_to_v[3];
+  // Sampled coordinate along each axis of the table.
+  const int y_idx = SharpYuvConvertValueToSampledIdx(y_fixed, sampling);
+  const int u_idx = SharpYuvConvertValueToSampledIdx(u_fixed, sampling);
+  const int v_idx = SharpYuvConvertValueToSampledIdx(v_fixed, sampling);
+  return y_idx + u_idx * sampling + v_idx * sampling * sampling;
+}
+
+// Converts one row of 'width' RGB pixels to their indices in the precomputed
+// risk score table and writes them to 'dst'. Consecutive pixels are
+// 'rgb_step' bytes apart in each of the r, g and b channels.
+static void SharpYuvRowToYuvSharpnessIndex(
+    const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr,
+    int rgb_step, int rgb_bit_depth, int width, uint16_t* dst,
+    const SharpYuvConversionMatrix* matrix,
+    int precomputed_scores_table_sampling) {
+  uint16_t* out = dst;
+  int remaining = width;
+  assert(rgb_bit_depth == 8);
+  (void)rgb_bit_depth;  // Unused for now.
+  while (remaining-- > 0) {
+    *out++ = (uint16_t)SharpYuvConvertToYuvSharpnessIndex(
+        *r_ptr, *g_ptr, *b_ptr, matrix, precomputed_scores_table_sampling);
+    r_ptr += rgb_step;
+    g_ptr += rgb_step;
+    b_ptr += rgb_step;
+  }
+}
+
+#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((uint64_t)(W) * (H), sizeof(T)))
+
+// Estimates the 420 risk of an 8-bit RGB image and stores the result,
+// rescaled to [0, 100], in '*score_out'.
+// Every pixel is first mapped to its index in the subsampled YUV space. Then,
+// for each pixel that has a right and a bottom neighbor, the precomputed
+// scores of the three color pairs (pixel/right, pixel/bottom, right/bottom)
+// are summed. Sums above kNoiseLevel are averaged to produce the score.
+// Returns 0 if the temporary rows cannot be allocated, 1 otherwise.
+static int DoEstimateRisk(const uint8_t* r_ptr, const uint8_t* g_ptr,
+                          const uint8_t* b_ptr, int rgb_step, int rgb_stride,
+                          int rgb_bit_depth, int width, int height,
+                          const SharpYuvOptions* options,
+                          const uint8_t precomputed_scores_table[],
+                          int precomputed_scores_table_sampling,
+                          float* score_out) {
+  // Number of distinct color indices; the score of the pair (a, b) is stored
+  // at precomputed_scores_table[a + sampling3 * b].
+  const int sampling3 = precomputed_scores_table_sampling *
+                        precomputed_scores_table_sampling *
+                        precomputed_scores_table_sampling;
+  const int kNoiseLevel = 4;
+  double total_score = 0;
+  double count = 0;
+  // Rows of indices in the precomputed table: row1 holds the current row and
+  // row2 the row below it.
+  uint16_t* row1 = SAFE_ALLOC(width, 1, uint16_t);
+  uint16_t* row2 = SAFE_ALLOC(width, 1, uint16_t);
+  uint16_t* tmp;
+  int i, j;
+
+  if (row1 == NULL || row2 == NULL) {
+    WebPFree(row1);
+    WebPFree(row2);
+    return 0;
+  }
+
+  // Convert the first row ahead.
+  SharpYuvRowToYuvSharpnessIndex(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth,
+                                 width, row2, options->yuv_matrix,
+                                 precomputed_scores_table_sampling);
+
+  for (j = 1; j < height; ++j) {
+    r_ptr += rgb_stride;
+    g_ptr += rgb_stride;
+    b_ptr += rgb_stride;
+    // Swap row 1 and row 2.
+    tmp = row1;
+    row1 = row2;
+    row2 = tmp;
+    // Convert the row below.
+    SharpYuvRowToYuvSharpnessIndex(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth,
+                                   width, row2, options->yuv_matrix,
+                                   precomputed_scores_table_sampling);
+    for (i = 0; i < width - 1; ++i) {
+      const int idx0 = row1[i + 0];  // current pixel
+      const int idx1 = row1[i + 1];  // right neighbor
+      const int idx2 = row2[i + 0];  // bottom neighbor
+      const int score = precomputed_scores_table[idx0 + sampling3 * idx1] +
+                        precomputed_scores_table[idx0 + sampling3 * idx2] +
+                        precomputed_scores_table[idx1 + sampling3 * idx2];
+      if (score > kNoiseLevel) {
+        total_score += score;
+        count += 1.0;
+      }
+    }
+  }
+  if (count > 0.) total_score /= count;
+
+  // If less than 1% of pixels were evaluated -> below noise level.
+  // The pixel count is computed in floating point since width * height may
+  // not fit in an int.
+  if (100. * count / ((double)width * height) < 1.) total_score = 0.;
+
+  // Rescale to [0:100]
+  total_score = (total_score > 25.) ? 100. : total_score * 100. / 25.;
+
+  WebPFree(row1);
+  WebPFree(row2);
+
+  *score_out = (float)total_score;
+  return 1;
+}
+
+#undef SAFE_ALLOC
+
+// Validates the arguments, then computes the 420 risk score of the image.
+// See extras.h for the description of the parameters and of the score.
+int SharpYuvEstimate420Risk(const void* r_ptr, const void* g_ptr,
+                            const void* b_ptr, int rgb_step, int rgb_stride,
+                            int rgb_bit_depth, int width, int height,
+                            const SharpYuvOptions* options, float* score) {
+  const int valid_dims =
+      (width >= 1 && height >= 1 && width != INT_MAX && height != INT_MAX);
+  const int valid_ptrs = (r_ptr != NULL && g_ptr != NULL && b_ptr != NULL &&
+                          options != NULL && score != NULL);
+  if (!valid_dims || !valid_ptrs) return 0;
+  // Only 8-bit samples are handled.
+  if (rgb_bit_depth != 8) return 0;
+
+  if (width > 4 && height > 4) {
+    return DoEstimateRisk(
+        (const uint8_t*)r_ptr, (const uint8_t*)g_ptr, (const uint8_t*)b_ptr,
+        rgb_step, rgb_stride, rgb_bit_depth, width, height, options,
+        kSharpYuvPrecomputedRisk, kSharpYuvPrecomputedRiskYuvSampling, score);
+  }
+
+  *score = 0.0f;  // too small, no real risk.
+  return 1;
+}
+
+//------------------------------------------------------------------------------
--- a/extras/extras.h
+++ b/extras/extras.h
@ -17,9 +17,10 @@
 extern "C" {
 #endif

+#include "sharpyuv/sharpyuv.h"
 #include "webp/encode.h"

-#define WEBP_EXTRAS_ABI_VERSION 0x0002    // MAJOR(8b) + MINOR(8b)
+#define WEBP_EXTRAS_ABI_VERSION 0x0003    // MAJOR(8b) + MINOR(8b)

 //------------------------------------------------------------------------------

@ -70,6 +71,38 @@ WEBP_EXTERN int VP8EstimateQuality(const uint8_t* const data, size_t size);

 //------------------------------------------------------------------------------

+// Computes a score between 0 and 100 which represents the risk of having visual
+// quality loss from converting an RGB image to YUV420.
+// A low score, typically < 40, means there is a low risk of artifacts from
+// chroma subsampling and a simple averaging algorithm can be used instead of
+// the more expensive SharpYuvConvert function.
+// A medium score, typically >= 40 and < 70, means that simple chroma
+// subsampling will produce artifacts and it may be advisable to use the more
+// costly SharpYuvConvert for YUV420 conversion.
+// A high score, typically >= 70, means there is a very high risk of artifacts
+// from chroma subsampling even with SharpYuvConvert, and best results might be
+// achieved by using YUV444.
+// If not using SharpYuvConvert, a threshold of about 50 can be used to decide
+// between (simple averaging) 420 and 444.
+// r_ptr, g_ptr, b_ptr: pointers to the source r, g and b channels. Should point
+//     to uint8_t buffers if rgb_bit_depth is 8, or uint16_t buffers otherwise.
+//     Must not be NULL.
+// rgb_step: distance in bytes between two horizontally adjacent pixels on the
+//     r, g and b channels. If rgb_bit_depth is > 8, it should be a
+//     multiple of 2.
+// rgb_stride: distance in bytes between two vertically adjacent pixels on the
+//     r, g, and b channels. If rgb_bit_depth is > 8, it should be a
+//     multiple of 2.
+// rgb_bit_depth: number of bits for each r/g/b value. Only a value of 8 is
+//     currently supported.
+// width, height: width and height of the image in pixels
+// options: must not be NULL; its yuv_matrix is used for the RGB to YUV
+//     conversion.
+// score: must not be NULL; receives the computed score on success. Images
+//     with a width or height of 4 pixels or less always get a score of 0.
+// Returns 0 on failure (invalid parameter, unsupported rgb_bit_depth or
+// memory allocation error), 1 otherwise.
+WEBP_EXTERN int SharpYuvEstimate420Risk(
+    const void* r_ptr, const void* g_ptr, const void* b_ptr, int rgb_step,
+    int rgb_stride, int rgb_bit_depth, int width, int height,
+    const SharpYuvOptions* options, float* score);
+
+//------------------------------------------------------------------------------
+
 #ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/extras/get_disto.c
+++ b/extras/get_disto.c
@ -227,10 +227,11 @@ static void Help(void) {
          WebPGetEnabledInputFileFormats());
 }

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, const char* argv[]) {
  WebPPicture pic1, pic2;
  size_t size1 = 0, size2 = 0;
-  int ret = 1;
+  int ret = EXIT_FAILURE;
  float disto[5];
  int type = 0;
  int c;
@ -246,7 +247,7 @@ int main(int argc, const char* argv[]) {

  if (!WebPPictureInit(&pic1) || !WebPPictureInit(&pic2)) {
    fprintf(stderr, "Can't init pictures\n");
-    FREE_WARGV_AND_RETURN(1);
+    FREE_WARGV_AND_RETURN(EXIT_FAILURE);
  }

  for (c = 1; c < argc; ++c) {
@ -262,7 +263,7 @@ int main(int argc, const char* argv[]) {
      use_gray = 1;
    } else if (!strcmp(argv[c], "-h")) {
      help = 1;
-      ret = 0;
+      ret = EXIT_SUCCESS;
    } else if (!strcmp(argv[c], "-o")) {
      if (++c == argc) {
        fprintf(stderr, "missing file name after %s option.\n", argv[c - 1]);
@ -337,7 +338,8 @@ int main(int argc, const char* argv[]) {
      fprintf(stderr, "Error during lossless encoding.\n");
      goto End;
    }
-    ret = ImgIoUtilWriteFile(output, data, data_size) ? 0 : 1;
+    ret = ImgIoUtilWriteFile(output, data, data_size) ? EXIT_SUCCESS
+                                                      : EXIT_FAILURE;
    WebPFree(data);
    if (ret) goto End;
 #else
@ -345,9 +347,10 @@ int main(int argc, const char* argv[]) {
    (void)data_size;
    fprintf(stderr, "Cannot save the difference map. Please recompile "
                    "without the WEBP_REDUCE_CSP flag.\n");
+    goto End;
 #endif  // WEBP_REDUCE_CSP
  }
-  ret = 0;
+  ret = EXIT_SUCCESS;

 End:
  WebPPictureFree(&pic1);
--- a/extras/sharpyuv_risk_table.c
+++ b/extras/sharpyuv_risk_table.c
--- a/extras/sharpyuv_risk_table.h
+++ b/extras/sharpyuv_risk_table.h
@ -0,0 +1,27 @@
+// Copyright 2023 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Precomputed data for 420 risk estimation.
+
+#ifndef WEBP_EXTRAS_SHARPYUV_RISK_TABLE_H_
+#define WEBP_EXTRAS_SHARPYUV_RISK_TABLE_H_
+
+#include "src/webp/types.h"
+
+extern const int kSharpYuvPrecomputedRiskYuvSampling;
+// Table of precomputed risk scores when chroma subsampling images with two
+// given colors.
+// Since precomputing values for all possible YUV colors would create a huge
+// table, the YUV space (i.e. [0, 255]^3) is reduced to
+// [0, kSharpYuvPrecomputedRiskYuvSampling-1]^3
+// where 255 maps to kSharpYuvPrecomputedRiskYuvSampling-1.
+// Table size: kSharpYuvPrecomputedRiskYuvSampling^6 bytes or 114 KiB
+extern const uint8_t kSharpYuvPrecomputedRisk[];
+
+#endif  // WEBP_EXTRAS_SHARPYUV_RISK_TABLE_H_
--- a/extras/vwebp_sdl.c
+++ b/extras/vwebp_sdl.c
@ -15,6 +15,7 @@
 // Author: James Zern (jzern@google.com)

 #include <stdio.h>
+#include <stdlib.h>

 #ifdef HAVE_CONFIG_H
 #include "webp/config.h"
@ -30,7 +31,7 @@
 #if defined(WEBP_HAVE_JUST_SDL_H)
 #include <SDL.h>
 #else
-#include <SDL/SDL.h>
+#include <SDL2/SDL.h>
 #endif

 static void ProcessEvents(void) {
@ -49,19 +50,26 @@ static void ProcessEvents(void) {
  }
 }

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, char* argv[]) {
  int c;
  int ok = 0;

  INIT_WARGV(argc, argv);

+  if (argc == 1) {
+    fprintf(stderr, "Usage: %s [-h] image.webp [more_files.webp...]\n",
+            argv[0]);
+    goto Error;
+  }
+
  for (c = 1; c < argc; ++c) {
    const char* file = NULL;
    const uint8_t* webp = NULL;
    size_t webp_size = 0;
    if (!strcmp(argv[c], "-h")) {
      printf("Usage: %s [-h] image.webp [more_files.webp...]\n", argv[0]);
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else {
      file = (const char*)GET_WARGV(argv, c);
    }
@ -87,7 +95,7 @@ int main(int argc, char* argv[]) {

 Error:
  SDL_Quit();
-  FREE_WARGV_AND_RETURN(ok ? 0 : 1);
+  FREE_WARGV_AND_RETURN(ok ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 #else  // !WEBP_HAVE_SDL
--- a/extras/webp_quality.c
+++ b/extras/webp_quality.c
@ -15,6 +15,7 @@
 #include "imageio/imageio_util.h"
 #include "../examples/unicode.h"

+// Returns EXIT_SUCCESS on success, EXIT_FAILURE on failure.
 int main(int argc, const char* argv[]) {
  int c;
  int quiet = 0;
@ -27,7 +28,7 @@ int main(int argc, const char* argv[]) {
      quiet = 1;
    } else if (!strcmp(argv[c], "-help") || !strcmp(argv[c], "-h")) {
      printf("webp_quality [-h][-quiet] webp_files...\n");
-      FREE_WARGV_AND_RETURN(0);
+      FREE_WARGV_AND_RETURN(EXIT_SUCCESS);
    } else {
      const char* const filename = (const char*)GET_WARGV(argv, c);
      const uint8_t* data = NULL;
@ -50,5 +51,5 @@ int main(int argc, const char* argv[]) {
      free((void*)data);
    }
  }
-  FREE_WARGV_AND_RETURN(ok ? 0 : 1);
+  FREE_WARGV_AND_RETURN(ok ? EXIT_SUCCESS : EXIT_FAILURE);
 }
--- a/extras/webp_to_sdl.c
+++ b/extras/webp_to_sdl.c
@ -20,88 +20,75 @@
 #include "webp_to_sdl.h"

 #include <stdio.h>
+
 #include "src/webp/decode.h"

 #if defined(WEBP_HAVE_JUST_SDL_H)
 #include <SDL.h>
 #else
-#include <SDL/SDL.h>
+#include <SDL2/SDL.h>
 #endif

 static int init_ok = 0;
 int WebPToSDL(const char* data, unsigned int data_size) {
+  // Decodes the WebP image stored in 'data' (of size 'data_size') and
+  // presents it in an SDL window. Returns 1 on success, 0 on failure.
  int ok = 0;
  VP8StatusCode status;
-  WebPDecoderConfig config;
-  WebPBitstreamFeatures* const input = &config.input;
-  WebPDecBuffer* const output = &config.output;
-  SDL_Surface* screen = NULL;
-  SDL_Surface* surface = NULL;
-
-  if (!WebPInitDecoderConfig(&config)) {
-    fprintf(stderr, "Library version mismatch!\n");
-    return 0;
-  }
+  WebPBitstreamFeatures input;
+  uint8_t* output = NULL;
+  SDL_Window* window = NULL;
+  SDL_Renderer* renderer = NULL;
+  SDL_Texture* texture = NULL;
+  int width, height;

  if (!init_ok) {
    SDL_Init(SDL_INIT_VIDEO);
    init_ok = 1;
  }

-  status = WebPGetFeatures((uint8_t*)data, (size_t)data_size, &config.input);
+  status = WebPGetFeatures((uint8_t*)data, (size_t)data_size, &input);
  if (status != VP8_STATUS_OK) goto Error;
+  width = input.width;
+  height = input.height;

-  screen = SDL_SetVideoMode(input->width, input->height, 32, SDL_SWSURFACE);
-  if (screen == NULL) {
-    fprintf(stderr, "Unable to set video mode (32bpp %dx%d)!\n",
-            input->width, input->height);
+  SDL_CreateWindowAndRenderer(width, height, 0, &window, &renderer);
+  if (window == NULL || renderer == NULL) {
+    fprintf(stderr, "Unable to create window or renderer!\n");
    goto Error;
  }
+  SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY,
+              "linear");  // make the scaled rendering look smoother.
+  SDL_RenderSetLogicalSize(renderer, width, height);

-  surface = SDL_CreateRGBSurface(SDL_SWSURFACE,
-                                 input->width, input->height, 32,
-                                 0x000000ffu,   // R mask
-                                 0x0000ff00u,   // G mask
-                                 0x00ff0000u,   // B mask
-                                 0xff000000u);  // A mask
-
-  if (surface == NULL) {
-    fprintf(stderr, "Unable to create %dx%d RGBA surface!\n",
-            input->width, input->height);
+  texture = SDL_CreateTexture(renderer, SDL_PIXELFORMAT_ABGR8888,
+                              SDL_TEXTUREACCESS_STREAMING, width, height);
+  if (texture == NULL) {
+    fprintf(stderr, "Unable to create %dx%d RGBA texture!\n", width, height);
    goto Error;
  }
-  if (SDL_MUSTLOCK(surface)) SDL_LockSurface(surface);

 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
-  output->colorspace = MODE_BGRA;
+  output = WebPDecodeBGRA((const uint8_t*)data, (size_t)data_size, &width,
+                          &height);
 #else
-  output->colorspace = MODE_RGBA;
+  output = WebPDecodeRGBA((const uint8_t*)data, (size_t)data_size, &width,
+                          &height);
 #endif
-  output->width  = surface->w;
-  output->height = surface->h;
-  output->u.RGBA.rgba   = surface->pixels;
-  output->u.RGBA.stride = surface->pitch;
-  output->u.RGBA.size   = surface->pitch * surface->h;
-  output->is_external_memory = 1;
-
-  status = WebPDecode((const uint8_t*)data, (size_t)data_size, &config);
-  if (status != VP8_STATUS_OK) {
+  if (output == NULL) {
    fprintf(stderr, "Error decoding image (%d)\n", status);
    goto Error;
  }

-  if (SDL_MUSTLOCK(surface)) SDL_UnlockSurface(surface);
-  if (SDL_BlitSurface(surface, NULL, screen, NULL) ||
-      SDL_Flip(screen)) {
-    goto Error;
-  }
-
+  SDL_UpdateTexture(texture, NULL, output, width * sizeof(uint32_t));
+  SDL_RenderClear(renderer);
+  SDL_RenderCopy(renderer, texture, NULL, NULL);
+  SDL_RenderPresent(renderer);
  ok = 1;

 Error:
-  SDL_FreeSurface(surface);
-  SDL_FreeSurface(screen);
-  WebPFreeDecBuffer(output);
+  // We should call SDL_DestroyWindow(window) but that makes .js fail.
+  // Destroy the texture before its renderer: SDL_DestroyRenderer() also frees
+  // the textures of the renderer, which would leave 'texture' dangling.
+  SDL_DestroyTexture(texture);
+  SDL_DestroyRenderer(renderer);
+  WebPFree(output);
  return ok;
 }

--- a/imageio/image_enc.c
+++ b/imageio/image_enc.c
@ -260,14 +260,20 @@ int WebPWritePAM(FILE* fout, const WebPDecBuffer* const buffer) {

 // Save 16b mode (RGBA4444, RGB565, ...) for debugging purpose.
 int WebPWrite16bAsPGM(FILE* fout, const WebPDecBuffer* const buffer) {
-  const uint32_t width = buffer->width;
-  const uint32_t height = buffer->height;
-  const uint8_t* rgba = buffer->u.RGBA.rgba;
-  const int stride = buffer->u.RGBA.stride;
+  uint32_t width, height;
+  uint8_t* rgba;
+  int stride;
  const uint32_t bytes_per_px = 2;
  uint32_t y;

-  if (fout == NULL || buffer == NULL || rgba == NULL) return 0;
+  if (fout == NULL || buffer == NULL) return 0;
+
+  width = buffer->width;
+  height = buffer->height;
+  rgba = buffer->u.RGBA.rgba;
+  stride = buffer->u.RGBA.stride;
+
+  if (rgba == NULL) return 0;

  fprintf(fout, "P5\n%u %u\n255\n", width * bytes_per_px, height);
  for (y = 0; y < height; ++y) {
@ -295,22 +301,29 @@ static void PutLE32(uint8_t* const dst, uint32_t value) {
 #define BMP_HEADER_SIZE 54
 #define BMP_HEADER_ALPHA_EXTRA_SIZE 16  // for alpha info
 int WebPWriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
-  const int has_alpha = WebPIsAlphaMode(buffer->colorspace);
-  const int header_size =
-      BMP_HEADER_SIZE + (has_alpha ? BMP_HEADER_ALPHA_EXTRA_SIZE : 0);
-  const uint32_t width = buffer->width;
-  const uint32_t height = buffer->height;
-  const uint8_t* rgba = buffer->u.RGBA.rgba;
-  const int stride = buffer->u.RGBA.stride;
-  const uint32_t bytes_per_px = has_alpha ? 4 : 3;
+  int has_alpha, header_size;
+  uint32_t width, height;
+  uint8_t* rgba;
+  int stride;
  uint32_t y;
-  const uint32_t line_size = bytes_per_px * width;
-  const uint32_t bmp_stride = (line_size + 3) & ~3;   // pad to 4
-  const uint32_t image_size = bmp_stride * height;
-  const uint32_t total_size =  image_size + header_size;
+  uint32_t bytes_per_px, line_size, image_size, bmp_stride, total_size;
  uint8_t bmp_header[BMP_HEADER_SIZE + BMP_HEADER_ALPHA_EXTRA_SIZE] = { 0 };

-  if (fout == NULL || buffer == NULL || rgba == NULL) return 0;
+  if (fout == NULL || buffer == NULL) return 0;
+
+  has_alpha = WebPIsAlphaMode(buffer->colorspace);
+  header_size = BMP_HEADER_SIZE + (has_alpha ? BMP_HEADER_ALPHA_EXTRA_SIZE : 0);
+  width = buffer->width;
+  height = buffer->height;
+  rgba = buffer->u.RGBA.rgba;
+  stride = buffer->u.RGBA.stride;
+  bytes_per_px = has_alpha ? 4 : 3;
+  line_size = bytes_per_px * width;
+  bmp_stride = (line_size + 3) & ~3;  // pad to 4
+  image_size = bmp_stride * height;
+  total_size = image_size + header_size;
+
+  if (rgba == NULL) return 0;

  // bitmap file header
  PutLE16(bmp_header + 0, 0x4d42);                // signature 'BM'
@ -372,17 +385,14 @@ int WebPWriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
 #define TIFF_HEADER_SIZE (EXTRA_DATA_OFFSET + EXTRA_DATA_SIZE)

 int WebPWriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
-  const int has_alpha = WebPIsAlphaMode(buffer->colorspace);
-  const uint32_t width = buffer->width;
-  const uint32_t height = buffer->height;
-  const uint8_t* rgba = buffer->u.RGBA.rgba;
-  const int stride = buffer->u.RGBA.stride;
-  const uint8_t bytes_per_px = has_alpha ? 4 : 3;
-  const uint8_t assoc_alpha =
-      WebPIsPremultipliedMode(buffer->colorspace) ? 1 : 2;
+  int has_alpha;
+  uint32_t width, height;
+  uint8_t* rgba;
+  int stride;
+  uint8_t bytes_per_px = 0;
+  const uint8_t assoc_alpha = 0;
  // For non-alpha case, we omit tag 0x152 (ExtraSamples).
-  const uint8_t num_ifd_entries = has_alpha ? NUM_IFD_ENTRIES
-                                            : NUM_IFD_ENTRIES - 1;
+  const uint8_t num_ifd_entries = 0;
  uint8_t tiff_header[TIFF_HEADER_SIZE] = {
    0x49, 0x49, 0x2a, 0x00,   // little endian signature
    8, 0, 0, 0,               // offset to the unique IFD that follows
@ -416,7 +426,20 @@ int WebPWriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
  };
  uint32_t y;

-  if (fout == NULL || buffer == NULL || rgba == NULL) return 0;
+  if (fout == NULL || buffer == NULL) return 0;
+
+  has_alpha = WebPIsAlphaMode(buffer->colorspace);
+  width = buffer->width;
+  height = buffer->height;
+  rgba = buffer->u.RGBA.rgba;
+  stride = buffer->u.RGBA.stride;
+
+  if (rgba == NULL) return 0;
+
+  // Update bytes_per_px, num_ifd_entries and assoc_alpha.
+  tiff_header[38] = tiff_header[102] = bytes_per_px = has_alpha ? 4 : 3;
+  tiff_header[8] = has_alpha ? NUM_IFD_ENTRIES : NUM_IFD_ENTRIES - 1;
+  tiff_header[186] = WebPIsPremultipliedMode(buffer->colorspace) ? 1 : 2;

  // Fill placeholders in IFD:
  PutLE32(tiff_header + 10 + 8, width);
--- a/imageio/imageio_util.c
+++ b/imageio/imageio_util.c
@ -89,6 +89,11 @@ int ImgIoUtilReadFile(const char* const file_name,
  }
  fseek(in, 0, SEEK_END);
  file_size = ftell(in);
+  if (file_size == (size_t)-1) {
+    fclose(in);
+    WFPRINTF(stderr, "error getting size of '%s'\n", (const W_CHAR*)file_name);
+    return 0;
+  }
  fseek(in, 0, SEEK_SET);
  // we allocate one extra byte for the \0 terminator
  file_data = (uint8_t*)WebPMalloc(file_size + 1);
--- a/imageio/jpegdec.c
+++ b/imageio/jpegdec.c
@ -206,8 +206,18 @@ struct my_error_mgr {

 static void my_error_exit(j_common_ptr dinfo) {
  struct my_error_mgr* myerr = (struct my_error_mgr*)dinfo->err;
+  // The following code is disabled in fuzzing mode because:
+  // - the logs can be flooded due to invalid JPEG files
+  // - msg_code is wrongfully seen as uninitialized by msan when the libjpeg
+  //   dependency is not built with sanitizers enabled
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  const int msg_code = myerr->pub.msg_code;
  fprintf(stderr, "libjpeg error: ");
  dinfo->err->output_message(dinfo);
+  if (msg_code == JERR_INPUT_EOF || msg_code == JERR_FILE_READ) {
+    fprintf(stderr, "`jpegtran -copy all` MAY be able to process this file.\n");
+  }
+#endif
  longjmp(myerr->setjmp_buffer, 1);
 }

--- a/imageio/pngdec.c
+++ b/imageio/pngdec.c
@ -139,6 +139,8 @@ static const struct {
  { "Raw profile type xmp",  ProcessRawProfile, METADATA_OFFSET(xmp) },
  // Exiftool puts exif data in APP1 chunk, too.
  { "Raw profile type APP1", ProcessRawProfile, METADATA_OFFSET(exif) },
+  // ImageMagick uses lowercase app1.
+  { "Raw profile type app1", ProcessRawProfile, METADATA_OFFSET(exif) },
  // XMP Specification Part 3, Section 3 #PNG
  { "XML:com.adobe.xmp",     MetadataCopy,      METADATA_OFFSET(xmp) },
  { NULL, NULL, 0 },
@ -159,6 +161,20 @@ static int ExtractMetadataFromPNG(png_structp png,
    png_textp text = NULL;
    const png_uint_32 num = png_get_text(png, info, &text, NULL);
    png_uint_32 i;
+
+#ifdef PNG_eXIf_SUPPORTED
+    // Look for an 'eXIf' tag. Preference is given to this tag as it's newer
+    // than the TextualData tags.
+    {
+      png_bytep exif;
+      png_uint_32 len;
+
+      if (png_get_eXIf_1(png, info, &len, &exif) == PNG_INFO_eXIf) {
+        if (!MetadataCopy((const char*)exif, len, &metadata->exif)) return 0;
+      }
+    }
+#endif  // PNG_eXIf_SUPPORTED
+
    // Look for EXIF / XMP metadata.
    for (i = 0; i < num; ++i, ++text) {
      int j;
@ -192,6 +208,7 @@ static int ExtractMetadataFromPNG(png_structp png,
        }
      }
    }
+#ifdef PNG_iCCP_SUPPORTED
    // Look for an ICC profile.
    {
      png_charp name;
@ -208,6 +225,7 @@ static int ExtractMetadataFromPNG(png_structp png,
        if (!MetadataCopy((const char*)profile, len, &metadata->iccp)) return 0;
      }
    }
+#endif  // PNG_iCCP_SUPPORTED
  }
  return 1;
 }
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -41,6 +41,7 @@ readonly TARGETDIR="${TOPDIR}/WebP.framework"
 readonly DECTARGETDIR="${TOPDIR}/WebPDecoder.framework"
 readonly MUXTARGETDIR="${TOPDIR}/WebPMux.framework"
 readonly DEMUXTARGETDIR="${TOPDIR}/WebPDemux.framework"
+readonly SHARPYUVTARGETDIR="${TOPDIR}/SharpYuv.framework"
 readonly DEVELOPER=$(xcode-select --print-path)
 readonly PLATFORMSROOT="${DEVELOPER}/Platforms"
 readonly LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
@ -52,7 +53,7 @@ DEMUXLIBLIST=''
 if [[ -z "${SDK}" ]]; then
  echo "iOS SDK not available"
  exit 1
-elif [[ ${SDK%%.*} -gt 8 ]]; then
+elif [[ ${SDK%%.*} -gt 8 && "${XCODE%%.*}" -lt 16 ]]; then
  EXTRA_CFLAGS="-fembed-bitcode"
 elif [[ ${SDK%%.*} -le 6 ]]; then
  echo "You need iOS SDK version 6.0 or above"
@ -63,7 +64,8 @@ echo "Xcode Version: ${XCODE}"
 echo "iOS SDK Version: ${SDK}"

 if [[ -e "${BUILDDIR}" || -e "${TARGETDIR}" || -e "${DECTARGETDIR}" \
-      || -e "${MUXTARGETDIR}" || -e "${DEMUXTARGETDIR}" ]]; then
+      || -e "${MUXTARGETDIR}" || -e "${DEMUXTARGETDIR}" \
+      || -e "${SHARPYUVTARGETDIR}" ]]; then
  cat << EOF
 WARNING: The following directories will be deleted:
 WARNING:   ${BUILDDIR}
@ -71,14 +73,16 @@ WARNING:   ${TARGETDIR}
 WARNING:   ${DECTARGETDIR}
 WARNING:   ${MUXTARGETDIR}
 WARNING:   ${DEMUXTARGETDIR}
+WARNING:   ${SHARPYUVTARGETDIR}
 WARNING: The build will continue in 5 seconds...
 EOF
  sleep 5
 fi
 rm -rf ${BUILDDIR} ${TARGETDIR} ${DECTARGETDIR} \
-    ${MUXTARGETDIR} ${DEMUXTARGETDIR}
+    ${MUXTARGETDIR} ${DEMUXTARGETDIR} ${SHARPYUVTARGETDIR}
 mkdir -p ${BUILDDIR} ${TARGETDIR}/Headers/ ${DECTARGETDIR}/Headers/ \
-    ${MUXTARGETDIR}/Headers/ ${DEMUXTARGETDIR}/Headers/
+    ${MUXTARGETDIR}/Headers/ ${DEMUXTARGETDIR}/Headers/ \
+    ${SHARPYUVTARGETDIR}/Headers/

 if [[ ! -e ${SRCDIR}/configure ]]; then
  if ! (cd ${SRCDIR} && sh autogen.sh); then
@ -134,13 +138,14 @@ for PLATFORM in ${PLATFORMS}; do
  set +x

  # Build only the libraries, skip the examples.
-  make V=0 -C sharpyuv
+  make V=0 -C sharpyuv install
  make V=0 -C src install

  LIBLIST+=" ${ROOTDIR}/lib/libwebp.a"
  DECLIBLIST+=" ${ROOTDIR}/lib/libwebpdecoder.a"
  MUXLIBLIST+=" ${ROOTDIR}/lib/libwebpmux.a"
  DEMUXLIBLIST+=" ${ROOTDIR}/lib/libwebpdemux.a"
+  SHARPYUVLIBLIST+=" ${ROOTDIR}/lib/libsharpyuv.a"

  make clean

@ -165,4 +170,9 @@ cp -a ${SRCDIR}/src/webp/{decode,types,mux_types,demux}.h \
    ${DEMUXTARGETDIR}/Headers/
 ${LIPO} -create ${DEMUXLIBLIST} -output ${DEMUXTARGETDIR}/WebPDemux

+echo "SHARPYUVLIBLIST = ${SHARPYUVLIBLIST}"
+cp -a ${SRCDIR}/sharpyuv/{sharpyuv,sharpyuv_csp}.h \
+    ${SHARPYUVTARGETDIR}/Headers/
+${LIPO} -create ${SHARPYUVLIBLIST} -output ${SHARPYUVTARGETDIR}/SharpYuv
+
 echo  "SUCCESS"
--- a/makefile.unix
+++ b/makefile.unix
@ -37,13 +37,13 @@ else
 endif

 # SDL flags: use sdl-config if it exists
-SDL_CONFIG = $(shell sdl-config --version 2> /dev/null)
+SDL_CONFIG = $(shell sdl2-config --version 2> /dev/null)
 ifneq ($(SDL_CONFIG),)
-  SDL_LIBS = $(shell sdl-config --libs)
-  SDL_FLAGS = $(shell sdl-config --cflags)
+  SDL_LIBS = $(shell sdl2-config --libs)
+  SDL_FLAGS = $(shell sdl2-config --cflags)
 else
  # use best-guess
-  SDL_LIBS = -lSDL
+  SDL_LIBS = -lSDL2
  SDL_FLAGS =
 endif

@ -276,6 +276,7 @@ UTILS_DEC_OBJS = \
    src/utils/color_cache_utils.o \
    src/utils/filters_utils.o \
    src/utils/huffman_utils.o \
+    src/utils/palette.o \
    src/utils/quant_levels_dec_utils.o \
    src/utils/random_utils.o \
    src/utils/rescaler_utils.o \
@ -290,6 +291,7 @@ UTILS_ENC_OBJS = \
 EXTRA_OBJS = \
    extras/extras.o \
    extras/quality_estimate.o \
+    extras/sharpyuv_risk_table.o \

 LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
 LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) \
@ -343,6 +345,7 @@ HDRS = \
    src/utils/filters_utils.h \
    src/utils/huffman_utils.h \
    src/utils/huffman_encode_utils.h \
+    src/utils/palette.h \
    src/utils/quant_levels_utils.h \
    src/utils/quant_levels_dec_utils.h \
    src/utils/random_utils.h \
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "March 17, 2022"
+.TH CWEBP 1 "September 17, 2024"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@ -135,7 +135,9 @@ are used, \fB\-size\fP value will prevail.
 Set a maximum number of passes to use during the dichotomy used by
 options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10, default is 1.
 If options \fB\-size\fP or \fB\-psnr\fP were used, but \fB\-pass\fP wasn't
-specified, a default value of '6' passes will be used.
+specified, a default value of '6' passes will be used. If \fB\-pass\fP is
+specified, but neither \fB\-size\fP nor \fB\-psnr\fP are, a target PSNR of 40dB
+will be used.
 .TP
 .BI \-qrange " int int
 Specifies the permissible interval for the quality factor. This is particularly
@ -178,8 +180,8 @@ Disable strong filtering (if filtering is being used thanks to the
 \fB\-f\fP option) and use simple filtering instead.
 .TP
 .B \-sharp_yuv
-Use more accurate and sharper RGB->YUV conversion if needed. Note that this
-process is slower than the default 'fast' RGB->YUV conversion.
+Use more accurate and sharper RGB->YUV conversion. Note that this process is
+slower than the default 'fast' RGB->YUV conversion.
 .TP
 .BI \-sns " int
 Specify the amplitude of the spatial noise shaping. Spatial noise shaping
@ -202,7 +204,8 @@ In the VP8 format, the so\-called control partition has a limit of 512k and
 is used to store the following information: whether the macroblock is skipped,
 which segment it belongs to, whether it is coded as intra 4x4 or intra 16x16
 mode, and finally the prediction modes to use for each of the sub\-blocks.
-For a very large image, 512k only leaves room to few bits per 16x16 macroblock.
+For a very large image, 512k only leaves room for a few bits per 16x16
+macroblock.
 The absolute minimum is 4 bits per macroblock. Skip, segment, and mode
 information can use up almost all these 4 bits (although the case is unlikely),
 which is problematic for very large images. The partition_limit factor controls
@ -211,7 +214,8 @@ useful in case the 512k limit is reached and the following message is displayed:
 \fIError code: 6 (PARTITION0_OVERFLOW: Partition #0 is too big to fit 512k)\fP.
 If using \fB\-partition_limit\fP is not enough to meet the 512k constraint, one
 should use less segments in order to save more header bits per macroblock.
-See the \fB\-segments\fP option.
+See the \fB\-segments\fP option. Note the \fB\-m\fP and \fB\-q\fP options also
+influence the encoder's decisions and ability to hit this limit.

 .SS LOGGING OPTIONS
 These options control the level of output:
@ -295,12 +299,12 @@ Note: each input format may not support all combinations.
 .B \-noasm
 Disable all assembly optimizations.

-.SH BUGS
-Please report all bugs to the issue tracker:
-https://bugs.chromium.org/p/webp
-.br
-Patches welcome! See this page to get started:
-https://www.webmproject.org/code/contribute/submitting\-patches/
+.SH EXIT STATUS
+If there were no problems during execution, \fBcwebp\fP exits with the value of
+the C constant \fBEXIT_SUCCESS\fP. This is usually zero.
+.PP
+If an error occurs, \fBcwebp\fP exits with the value of the C constant
+\fBEXIT_FAILURE\fP. This is usually one.

 .SH EXAMPLES
 cwebp \-q 50 -lossless picture.png \-o picture_lossless.webp
@ -320,6 +324,13 @@ https://chromium.googlesource.com/webm/libwebp
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).

+.SH REPORTING BUGS
+Please report all bugs to the issue tracker:
+https://issues.webmproject.org
+.br
+Patches welcome! See this page to get started:
+https://www.webmproject.org/code/contribute/submitting\-patches/
+
 .SH SEE ALSO
 .BR dwebp (1),
 .BR gif2webp (1)
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "November 17, 2021"
+.TH DWEBP 1 "July 18, 2024"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@ -108,12 +108,12 @@ Print extra information (decoding time in particular).
 .B \-noasm
 Disable all assembly optimizations.

-.SH BUGS
-Please report all bugs to the issue tracker:
-https://bugs.chromium.org/p/webp
-.br
-Patches welcome! See this page to get started:
-https://www.webmproject.org/code/contribute/submitting\-patches/
+.SH EXIT STATUS
+If there were no problems during execution, \fBdwebp\fP exits with the value of
+the C constant \fBEXIT_SUCCESS\fP. This is usually zero.
+.PP
+If an error occurs, \fBdwebp\fP exits with the value of the C constant
+\fBEXIT_FAILURE\fP. This is usually one.

 .SH EXAMPLES
 dwebp picture.webp \-o output.png
@ -133,6 +133,13 @@ https://chromium.googlesource.com/webm/libwebp
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).

+.SH REPORTING BUGS
+Please report all bugs to the issue tracker:
+https://issues.webmproject.org
+.br
+Patches welcome! See this page to get started:
+https://www.webmproject.org/code/contribute/submitting\-patches/
+
 .SH SEE ALSO
 .BR cwebp (1),
 .BR gif2webp (1),
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "November 17, 2021"
+.TH GIF2WEBP 1 "November 4, 2024"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@ -39,6 +39,18 @@ Encode the image using lossy compression.
 Mixed compression mode: optimize compression of the image by picking either
 lossy or lossless compression for each frame heuristically.
 .TP
+.BI \-near_lossless " int
+Specify the level of near\-lossless image preprocessing. This option adjusts
+pixel values to help compressibility, but has minimal impact on the visual
+quality. It triggers lossless compression mode automatically. The range is 0
+(maximum preprocessing) to 100 (no preprocessing, the default). The typical
+value is around 60. Note that lossy with \fB\-q 100\fP can at times yield
+better results.
+.TP
+.B \-sharp_yuv
+Use more accurate and sharper RGB->YUV conversion. Note that this process is
+slower than the default 'fast' RGB->YUV conversion.
+.TP
 .BI \-q " float
 Specify the compression factor for RGB channels between 0 and 100. The default
 is 75.
@ -126,12 +138,12 @@ Print extra information.
 .B \-quiet
 Do not print anything.

-.SH BUGS
-Please report all bugs to the issue tracker:
-https://bugs.chromium.org/p/webp
-.br
-Patches welcome! See this page to get started:
-https://www.webmproject.org/code/contribute/submitting\-patches/
+.SH EXIT STATUS
+If there were no problems during execution, \fBgif2webp\fP exits with the value
+of the C constant \fBEXIT_SUCCESS\fP. This is usually zero.
+.PP
+If an error occurs, \fBgif2webp\fP exits with the value of the C constant
+\fBEXIT_FAILURE\fP. This is usually one.

 .SH EXAMPLES
 gif2webp picture.gif \-o picture.webp
@ -155,6 +167,13 @@ https://chromium.googlesource.com/webm/libwebp
 This manual page was written by Urvang Joshi <urvang@google.com>, for the
 Debian project (and may be used by others).

+.SH REPORTING BUGS
+Please report all bugs to the issue tracker:
+https://issues.webmproject.org
+.br
+Patches welcome! See this page to get started:
+https://www.webmproject.org/code/contribute/submitting\-patches/
+
 .SH SEE ALSO
 .BR cwebp (1),
 .BR dwebp (1),
--- a/man/img2webp.1
+++ b/man/img2webp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH IMG2WEBP 1 "March 17, 2023"
+.TH IMG2WEBP 1 "November 26, 2024"
 .SH NAME
 img2webp \- create animated WebP file from a sequence of input images.
 .SH SYNOPSIS
@ -53,8 +53,8 @@ value is around 60. Note that lossy with \fB\-q 100\fP can at times yield
 better results.
 .TP
 .B \-sharp_yuv
-Use more accurate and sharper RGB->YUV conversion if needed. Note that this
-process is slower than the default 'fast' RGB->YUV conversion.
+Use more accurate and sharper RGB->YUV conversion. Note that this process is
+slower than the default 'fast' RGB->YUV conversion.
 .TP
 .BI \-loop " int
 Specifies the number of times the animation should loop. Using '0'
@ -88,18 +88,27 @@ Specify the compression factor between 0 and 100. The default is 75.
 Specify the compression method to use. This parameter controls the
 trade off between encoding speed and the compressed file size and quality.
 Possible values range from 0 to 6. Default value is 4.
+When higher values are used, the encoder will spend more time inspecting
+additional encoding possibilities and decide on the quality gain.
+Lower values can result in faster processing time at the expense of
+larger file size and lower compression quality.
+.TP
+.B \-exact, \-noexact
+Preserve or alter RGB values in transparent area. The default is
+\fB\-noexact\fP, to help compressibility. Note \fB\-noexact\fP may cause
+artifacts in frames compressed with \fB\-lossy\fP.
+
+.SH EXIT STATUS
+If there were no problems during execution, \fBimg2webp\fP exits with the value
+of the C constant \fBEXIT_SUCCESS\fP. This is usually zero.
+.PP
+If an error occurs, \fBimg2webp\fP exits with the value of the C constant
+\fBEXIT_FAILURE\fP. This is usually one.

 .SH EXAMPLE
 img2webp -loop 2 in0.png -lossy in1.jpg -d 80 in2.tiff -o out.webp
 .br

-.SH BUGS
-Please report all bugs to the issue tracker:
-https://bugs.chromium.org/p/webp
-.br
-Patches welcome! See this page to get started:
-https://www.webmproject.org/code/contribute/submitting\-patches/
-
 .SH AUTHORS
 \fBimg2webp\fP is a part of libwebp and was written by the WebP team.
 .br
@ -109,6 +118,13 @@ https://chromium.googlesource.com/webm/libwebp
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).

+.SH REPORTING BUGS
+Please report all bugs to the issue tracker:
+https://issues.webmproject.org
+.br
+Patches welcome! See this page to get started:
+https://www.webmproject.org/code/contribute/submitting\-patches/
+
 .SH SEE ALSO
 .BR webpmux (1),
 .BR gif2webp (1)
--- a/man/vwebp.1
+++ b/man/vwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH VWEBP 1 "November 17, 2021"
+.TH VWEBP 1 "July 18, 2024"
 .SH NAME
 vwebp \- decompress a WebP file and display it in a window
 .SH SYNOPSIS
@ -72,12 +72,12 @@ Disable blending and disposal process, for debugging purposes.
 .B 'q' / 'Q' / ESC
 Quit.

-.SH BUGS
-Please report all bugs to the issue tracker:
-https://bugs.chromium.org/p/webp
-.br
-Patches welcome! See this page to get started:
-https://www.webmproject.org/code/contribute/submitting\-patches/
+.SH EXIT STATUS
+If there were no problems during execution, \fBvwebp\fP exits with the value of
+the C constant \fBEXIT_SUCCESS\fP. This is usually zero.
+.PP
+If an error occurs, \fBvwebp\fP exits with the value of the C constant
+\fBEXIT_FAILURE\fP. This is usually one.

 .SH EXAMPLES
 vwebp picture.webp
@ -94,6 +94,13 @@ https://chromium.googlesource.com/webm/libwebp
 .PP
 This manual page was written for the Debian project (and may be used by others).

+.SH REPORTING BUGS
+Please report all bugs to the issue tracker:
+https://issues.webmproject.org
+.br
+Patches welcome! See this page to get started:
+https://www.webmproject.org/code/contribute/submitting\-patches/
+
 .SH SEE ALSO
 .BR dwebp (1)
 .br
--- a/man/webpinfo.1
+++ b/man/webpinfo.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPINFO 1 "November 17, 2021"
+.TH WEBPINFO 1 "July 18, 2024"
 .SH NAME
 webpinfo \- print out the chunk level structure of WebP files
 along with basic integrity checks.
@ -47,12 +47,12 @@ Detailed usage instructions.
 Input files in WebP format. Input files must come last, following
 options (if any). There can be multiple input files.

-.SH BUGS
-Please report all bugs to the issue tracker:
-https://bugs.chromium.org/p/webp
-.br
-Patches welcome! See this page to get started:
-https://www.webmproject.org/code/contribute/submitting\-patches/
+.SH EXIT STATUS
+If there were no problems during execution, \fBwebpinfo\fP exits with the value
+of the C constant \fBEXIT_SUCCESS\fP. This is usually zero.
+.PP
+If an error occurs, \fBwebpinfo\fP exits with the value of the C constant
+\fBEXIT_FAILURE\fP. This is usually one.

 .SH EXAMPLES
 .br
@ -73,6 +73,13 @@ https://chromium.googlesource.com/webm/libwebp
 This manual page was written by Hui Su <huisu@google.com>,
 for the Debian project (and may be used by others).

+.SH REPORTING BUGS
+Please report all bugs to the issue tracker:
+https://issues.webmproject.org
+.br
+Patches welcome! See this page to get started:
+https://www.webmproject.org/code/contribute/submitting\-patches/
+
 .SH SEE ALSO
 .BR webpmux (1)
 .br
--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPMUX 1 "November 17, 2021"
+.TH WEBPMUX 1 "July 18, 2024"
 .SH NAME
 webpmux \- create animated WebP files from non\-animated WebP images, extract
 frames from animated WebP images, and manage XMP/EXIF metadata and ICC profile.
@ -186,12 +186,12 @@ Output file in WebP format.
 .TP
 The nature of EXIF, XMP and ICC data is not checked and is assumed to be valid.

-.SH BUGS
-Please report all bugs to the issue tracker:
-https://bugs.chromium.org/p/webp
-.br
-Patches welcome! See this page to get started:
-https://www.webmproject.org/code/contribute/submitting\-patches/
+.SH EXIT STATUS
+If there were no problems during execution, \fBwebpmux\fP exits with the value
+of the C constant \fBEXIT_SUCCESS\fP. This is usually zero.
+.PP
+If an error occurs, \fBwebpmux\fP exits with the value of the C constant
+\fBEXIT_FAILURE\fP. This is usually one.

 .SH EXAMPLES
 .P
@ -262,6 +262,13 @@ https://chromium.googlesource.com/webm/libwebp
 This manual page was written by Vikas Arora <vikaas.arora@gmail.com>,
 for the Debian project (and may be used by others).

+.SH REPORTING BUGS
+Please report all bugs to the issue tracker:
+https://issues.webmproject.org
+.br
+Patches welcome! See this page to get started:
+https://www.webmproject.org/code/contribute/submitting\-patches/
+
 .SH SEE ALSO
 .BR cwebp (1),
 .BR dwebp (1),
--- a/sharpyuv/Makefile.am
+++ b/sharpyuv/Makefile.am
@ -33,7 +33,7 @@ libsharpyuv_la_SOURCES += sharpyuv_gamma.c sharpyuv_gamma.h
 libsharpyuv_la_SOURCES += sharpyuv.c sharpyuv.h

 libsharpyuv_la_CPPFLAGS = $(AM_CPPFLAGS)
-libsharpyuv_la_LDFLAGS = -no-undefined -version-info 0:1:0 -lm
+libsharpyuv_la_LDFLAGS = -no-undefined -version-info 1:1:1 -lm
 libsharpyuv_la_LIBADD =
 libsharpyuv_la_LIBADD += libsharpyuv_sse2.la
 libsharpyuv_la_LIBADD += libsharpyuv_neon.la
--- a/sharpyuv/libsharpyuv.rc
+++ b/sharpyuv/libsharpyuv.rc
@ -6,8 +6,8 @@
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 0,0,2,1
- PRODUCTVERSION 0,0,2,1
+ FILEVERSION 0,0,4,1
+ PRODUCTVERSION 0,0,4,1
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@ -24,12 +24,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "Google, Inc."
            VALUE "FileDescription", "libsharpyuv DLL"
-            VALUE "FileVersion", "0.2.1"
+            VALUE "FileVersion", "0.4.1"
            VALUE "InternalName", "libsharpyuv.dll"
-            VALUE "LegalCopyright", "Copyright (C) 2023"
+            VALUE "LegalCopyright", "Copyright (C) 2024"
            VALUE "OriginalFilename", "libsharpyuv.dll"
            VALUE "ProductName", "SharpYuv Library"
-            VALUE "ProductVersion", "0.2.1"
+            VALUE "ProductVersion", "0.4.1"
        END
    END
    BLOCK "VarFileInfo"
--- a/sharpyuv/sharpyuv.c
+++ b/sharpyuv/sharpyuv.c
@ -75,41 +75,48 @@ static int RGBToGray(int64_t r, int64_t g, int64_t b) {
 }

 static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
-                          int rgb_bit_depth) {
+                          int rgb_bit_depth,
+                          SharpYuvTransferFunctionType transfer_type) {
  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
-  const uint32_t A = SharpYuvGammaToLinear(a, bit_depth);
-  const uint32_t B = SharpYuvGammaToLinear(b, bit_depth);
-  const uint32_t C = SharpYuvGammaToLinear(c, bit_depth);
-  const uint32_t D = SharpYuvGammaToLinear(d, bit_depth);
-  return SharpYuvLinearToGamma((A + B + C + D + 2) >> 2, bit_depth);
+  const uint32_t A = SharpYuvGammaToLinear(a, bit_depth, transfer_type);
+  const uint32_t B = SharpYuvGammaToLinear(b, bit_depth, transfer_type);
+  const uint32_t C = SharpYuvGammaToLinear(c, bit_depth, transfer_type);
+  const uint32_t D = SharpYuvGammaToLinear(d, bit_depth, transfer_type);
+  return SharpYuvLinearToGamma((A + B + C + D + 2) >> 2, bit_depth,
+                               transfer_type);
 }

 static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
-                                int rgb_bit_depth) {
+                                int rgb_bit_depth,
+                                SharpYuvTransferFunctionType transfer_type) {
  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
-  int i;
-  for (i = 0; i < w; ++i) {
-    const uint32_t R = SharpYuvGammaToLinear(src[0 * w + i], bit_depth);
-    const uint32_t G = SharpYuvGammaToLinear(src[1 * w + i], bit_depth);
-    const uint32_t B = SharpYuvGammaToLinear(src[2 * w + i], bit_depth);
+  int i = 0;
+  do {
+    const uint32_t R =
+        SharpYuvGammaToLinear(src[0 * w + i], bit_depth, transfer_type);
+    const uint32_t G =
+        SharpYuvGammaToLinear(src[1 * w + i], bit_depth, transfer_type);
+    const uint32_t B =
+        SharpYuvGammaToLinear(src[2 * w + i], bit_depth, transfer_type);
    const uint32_t Y = RGBToGray(R, G, B);
-    dst[i] = (fixed_y_t)SharpYuvLinearToGamma(Y, bit_depth);
-  }
+    dst[i] = (fixed_y_t)SharpYuvLinearToGamma(Y, bit_depth, transfer_type);
+  } while (++i < w);
 }

 static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
-                         fixed_t* dst, int uv_w, int rgb_bit_depth) {
-  int i;
-  for (i = 0; i < uv_w; ++i) {
+                         fixed_t* dst, int uv_w, int rgb_bit_depth,
+                         SharpYuvTransferFunctionType transfer_type) {
+  int i = 0;
+  do {
    const int r =
        ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], src2[0 * uv_w + 0],
-                  src2[0 * uv_w + 1], rgb_bit_depth);
+                  src2[0 * uv_w + 1], rgb_bit_depth, transfer_type);
    const int g =
        ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], src2[2 * uv_w + 0],
-                  src2[2 * uv_w + 1], rgb_bit_depth);
+                  src2[2 * uv_w + 1], rgb_bit_depth, transfer_type);
    const int b =
        ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], src2[4 * uv_w + 0],
-                  src2[4 * uv_w + 1], rgb_bit_depth);
+                  src2[4 * uv_w + 1], rgb_bit_depth, transfer_type);
    const int W = RGBToGray(r, g, b);
    dst[0 * uv_w] = (fixed_t)(r - W);
    dst[1 * uv_w] = (fixed_t)(g - W);
@ -117,15 +124,15 @@ static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
    dst  += 1;
    src1 += 2;
    src2 += 2;
-  }
+  } while (++i < uv_w);
 }

 static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
-  int i;
+  int i = 0;
  assert(w > 0);
-  for (i = 0; i < w; ++i) {
+  do {
    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
-  }
+  } while (++i < w);
 }

 //------------------------------------------------------------------------------
@ -151,9 +158,9 @@ static void ImportOneRow(const uint8_t* const r_ptr,
  // Convert the rgb_step from a number of bytes to a number of uint8_t or
  // uint16_t values depending the bit depth.
  const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
-  int i;
+  int i = 0;
  const int w = (pic_width + 1) & ~1;
-  for (i = 0; i < pic_width; ++i) {
+  do {
    const int off = i * step;
    const int shift = GetPrecisionShift(rgb_bit_depth);
    if (rgb_bit_depth == 8) {
@ -165,7 +172,7 @@ static void ImportOneRow(const uint8_t* const r_ptr,
      dst[i + 1 * w] = Shift(((uint16_t*)g_ptr)[off], shift);
      dst[i + 2 * w] = Shift(((uint16_t*)b_ptr)[off], shift);
    }
-  }
+  } while (++i < pic_width);
  if (pic_width & 1) {  // replicate rightmost pixel
    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
@ -233,8 +240,11 @@ static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
  const int sfix = GetPrecisionShift(rgb_bit_depth);
  const int yuv_max = (1 << yuv_bit_depth) - 1;

-  for (best_uv = best_uv_base, j = 0; j < height; ++j) {
-    for (i = 0; i < width; ++i) {
+  best_uv = best_uv_base;
+  j = 0;
+  do {
+    i = 0;
+    do {
      const int off = (i >> 1);
      const int W = best_y[i];
      const int r = best_uv[off + 0 * uv_w] + W;
@ -246,19 +256,22 @@ static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
      } else {
        ((uint16_t*)y_ptr)[i] = clip(y, yuv_max);
      }
-    }
+    } while (++i < width);
    best_y += w;
    best_uv += (j & 1) * 3 * uv_w;
    y_ptr += y_stride;
-  }
-  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
-    for (i = 0; i < uv_w; ++i) {
-      const int off = i;
+  } while (++j < height);
+
+  best_uv = best_uv_base;
+  j = 0;
+  do {
+    i = 0;
+    do {
      // Note r, g and b values here are off by W, but a constant offset on all
      // 3 components doesn't change the value of u and v with a YCbCr matrix.
-      const int r = best_uv[off + 0 * uv_w];
-      const int g = best_uv[off + 1 * uv_w];
-      const int b = best_uv[off + 2 * uv_w];
+      const int r = best_uv[i + 0 * uv_w];
+      const int g = best_uv[i + 1 * uv_w];
+      const int b = best_uv[i + 2 * uv_w];
      const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
      const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
      if (yuv_bit_depth <= 8) {
@ -268,11 +281,11 @@ static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
        ((uint16_t*)u_ptr)[i] = clip(u, yuv_max);
        ((uint16_t*)v_ptr)[i] = clip(v, yuv_max);
      }
-    }
+    } while (++i < uv_w);
    best_uv += 3 * uv_w;
    u_ptr += u_stride;
    v_ptr += v_stride;
-  }
+  } while (++j < uv_h);
  return 1;
 }

@ -285,7 +298,7 @@ static void* SafeMalloc(uint64_t nmemb, size_t size) {
  return malloc((size_t)total_size);
 }

-#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((W) * (H), sizeof(T)))
+#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((uint64_t)(W) * (H), sizeof(T)))

 static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
                            const uint8_t* b_ptr, int rgb_step, int rgb_stride,
@ -293,12 +306,14 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
                            uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
                            int v_stride, int yuv_bit_depth, int width,
                            int height,
-                            const SharpYuvConversionMatrix* yuv_matrix) {
+                            const SharpYuvConversionMatrix* yuv_matrix,
+                            SharpYuvTransferFunctionType transfer_type) {
  // we expand the right/bottom border if needed
  const int w = (width + 1) & ~1;
  const int h = (height + 1) & ~1;
  const int uv_w = w >> 1;
  const int uv_h = h >> 1;
+  const int y_bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
  uint64_t prev_diff_y_sum = ~0;
  int j, iter;

@ -346,9 +361,9 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
    StoreGray(src1, best_y + 0, w);
    StoreGray(src2, best_y + w, w);

-    UpdateW(src1, target_y, w, rgb_bit_depth);
-    UpdateW(src2, target_y + w, w, rgb_bit_depth);
-    UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth);
+    UpdateW(src1, target_y, w, rgb_bit_depth, transfer_type);
+    UpdateW(src2, target_y + w, w, rgb_bit_depth, transfer_type);
+    UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth, transfer_type);
    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
    best_y += 2 * w;
    best_uv += 3 * uv_w;
@ -369,7 +384,8 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
    best_uv = best_uv_base;
    target_y = target_y_base;
    target_uv = target_uv_base;
-    for (j = 0; j < h; j += 2) {
+    j = 0;
+    do {
      fixed_y_t* const src1 = tmp_buffer + 0 * w;
      fixed_y_t* const src2 = tmp_buffer + 3 * w;
      {
@ -380,21 +396,21 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
        cur_uv = next_uv;
      }

-      UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth);
-      UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth);
-      UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth);
+      UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth, transfer_type);
+      UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth, transfer_type);
+      UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth, transfer_type);

      // update two rows of Y and one row of RGB
      diff_y_sum +=
-          SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w,
-                          rgb_bit_depth + GetPrecisionShift(rgb_bit_depth));
+          SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w, y_bit_depth);
      SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);

      best_y += 2 * w;
      best_uv += 3 * uv_w;
      target_y += 2 * w;
      target_uv += 3 * uv_w;
-    }
+      j += 2;
+    } while (j < h);
    // test exit condition
    if (iter > 0) {
      if (diff_y_sum < diff_y_threshold) break;
@ -418,6 +434,7 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
  free(tmp_buffer);
  return ok;
 }
+
 #undef SAFE_ALLOC

 #if defined(WEBP_USE_THREAD) && !defined(_WIN32)
@ -462,12 +479,42 @@ void SharpYuvInit(VP8CPUInfo cpu_info_func) {
  UNLOCK_ACCESS_AND_RETURN;
 }

-int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
-                    const void* b_ptr, int rgb_step, int rgb_stride,
-                    int rgb_bit_depth, void* y_ptr, int y_stride,
-                    void* u_ptr, int u_stride, void* v_ptr,
-                    int v_stride, int yuv_bit_depth, int width,
+int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr,
+                    int rgb_step, int rgb_stride, int rgb_bit_depth,
+                    void* y_ptr, int y_stride, void* u_ptr, int u_stride,
+                    void* v_ptr, int v_stride, int yuv_bit_depth, int width,
                    int height, const SharpYuvConversionMatrix* yuv_matrix) {
+  SharpYuvOptions options;
+  options.yuv_matrix = yuv_matrix;
+  options.transfer_type = kSharpYuvTransferFunctionSrgb;
+  return SharpYuvConvertWithOptions(
+      r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride, rgb_bit_depth, y_ptr, y_stride,
+      u_ptr, u_stride, v_ptr, v_stride, yuv_bit_depth, width, height, &options);
+}
+
+// Version-checked initializer behind SharpYuvOptionsInit().
+// 'version' is the caller's packed version: the major number is read from
+// bits 24 and up, the minor number from bits 16..23.
+// Returns 0 and leaves 'options' untouched if either pointer is NULL, if the
+// major versions differ, or if the major version is 0 and the minor versions
+// differ. Otherwise stores 'yuv_matrix', selects the sRGB transfer function
+// and returns 1.
+int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix* yuv_matrix,
+                                SharpYuvOptions* options, int version) {
+  const int caller_major = (version >> 24);
+  const int caller_minor = (version >> 16) & 0xff;
+  if (options == NULL || yuv_matrix == NULL) return 0;
+  if (caller_major != SHARPYUV_VERSION_MAJOR) return 0;
+  // From here on caller_major equals SHARPYUV_VERSION_MAJOR.
+  if (caller_major == 0 && caller_minor != SHARPYUV_VERSION_MINOR) return 0;
+  options->yuv_matrix = yuv_matrix;
+  options->transfer_type = kSharpYuvTransferFunctionSrgb;
+  return 1;
+}
+
+int SharpYuvConvertWithOptions(const void* r_ptr, const void* g_ptr,
+                               const void* b_ptr, int rgb_step, int rgb_stride,
+                               int rgb_bit_depth, void* y_ptr, int y_stride,
+                               void* u_ptr, int u_stride, void* v_ptr,
+                               int v_stride, int yuv_bit_depth, int width,
+                               int height, const SharpYuvOptions* options) {
+  const SharpYuvConversionMatrix* yuv_matrix = options->yuv_matrix;
+  SharpYuvTransferFunctionType transfer_type = options->transfer_type;
  SharpYuvConversionMatrix scaled_matrix;
  const int rgb_max = (1 << rgb_bit_depth) - 1;
  const int rgb_round = 1 << (rgb_bit_depth - 1);
@ -486,7 +533,7 @@ int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
  if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
    return 0;
  }
-  if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride %2 != 0)) {
+  if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride % 2 != 0)) {
    // Step/stride should be even for uint16_t buffers.
    return 0;
  }
@ -518,10 +565,11 @@ int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
  scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
  scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);

-  return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
-                          rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
-                          v_ptr, v_stride, yuv_bit_depth, width, height,
-                          &scaled_matrix);
+  return DoSharpArgbToYuv(
+      (const uint8_t*)r_ptr, (const uint8_t*)g_ptr, (const uint8_t*)b_ptr,
+      rgb_step, rgb_stride, rgb_bit_depth, (uint8_t*)y_ptr, y_stride,
+      (uint8_t*)u_ptr, u_stride, (uint8_t*)v_ptr, v_stride, yuv_bit_depth,
+      width, height, &scaled_matrix, transfer_type);
 }

 //------------------------------------------------------------------------------
--- a/sharpyuv/sharpyuv.h
+++ b/sharpyuv/sharpyuv.h
@ -22,21 +22,36 @@ extern "C" {
 #else
 // This explicitly marks library functions and allows for changing the
 // signature for e.g., Windows DLL builds.
-#if defined(__GNUC__) && __GNUC__ >= 4
+#if defined(_WIN32) && defined(WEBP_DLL)
+#define SHARPYUV_EXTERN __declspec(dllexport)
+#elif defined(__GNUC__) && __GNUC__ >= 4
 #define SHARPYUV_EXTERN extern __attribute__((visibility("default")))
 #else
-#if defined(_MSC_VER) && defined(WEBP_DLL)
-#define SHARPYUV_EXTERN __declspec(dllexport)
-#else
 #define SHARPYUV_EXTERN extern
-#endif /* _MSC_VER && WEBP_DLL */
-#endif /* __GNUC__ >= 4 */
+#endif /* defined(_WIN32) && defined(WEBP_DLL) */
 #endif /* WEBP_EXTERN */
 #endif /* SHARPYUV_EXTERN */

+#ifndef SHARPYUV_INLINE
+#ifdef WEBP_INLINE
+#define SHARPYUV_INLINE WEBP_INLINE
+#else
+#ifndef _MSC_VER
+#if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#define SHARPYUV_INLINE inline
+#else
+#define SHARPYUV_INLINE
+#endif
+#else
+#define SHARPYUV_INLINE __forceinline
+#endif /* _MSC_VER */
+#endif /* WEBP_INLINE */
+#endif /* SHARPYUV_INLINE */
+
 // SharpYUV API version following the convention from semver.org
 #define SHARPYUV_VERSION_MAJOR 0
-#define SHARPYUV_VERSION_MINOR 2
+#define SHARPYUV_VERSION_MINOR 4
 #define SHARPYUV_VERSION_PATCH 1
 // Version as a uint32_t. The major number is the high 8 bits.
 // The minor number is the middle 8 bits. The patch number is the low 16 bits.
@ -51,16 +66,50 @@ extern "C" {
 SHARPYUV_EXTERN int SharpYuvGetVersion(void);

 // RGB to YUV conversion matrix, in 16 bit fixed point.
-// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
-// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
-// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
-// Then y, u and v values are divided by 1<<16 and rounded.
+// y_ = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
+// u_ = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
+// v_ = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
+// Then the values are divided by 1<<16 and rounded.
+// y = (y_ + (1 << 15)) >> 16
+// u = (u_ + (1 << 15)) >> 16
+// v = (v_ + (1 << 15)) >> 16
+//
+// Typically, the offset values rgb_to_y[3], rgb_to_u[3] and rgb_to_v[3] depend
+// on the input's bit depth, e.g., rgb_to_u[3] = 1 << (rgb_bit_depth - 1 + 16).
+// See also sharpyuv_csp.h to get a predefined matrix or generate a matrix.
 typedef struct {
  int rgb_to_y[4];
  int rgb_to_u[4];
  int rgb_to_v[4];
 } SharpYuvConversionMatrix;

+typedef struct SharpYuvOptions SharpYuvOptions;
+
+// Enums for transfer functions, as defined in H.273,
+// https://www.itu.int/rec/T-REC-H.273-202107-I/en
+// The explicit numeric values below skip the codes marked reserved or
+// unspecified.
+typedef enum SharpYuvTransferFunctionType {
+  // 0 is reserved
+  kSharpYuvTransferFunctionBt709 = 1,
+  // 2 is unspecified
+  // 3 is reserved
+  kSharpYuvTransferFunctionBt470M = 4,
+  kSharpYuvTransferFunctionBt470Bg = 5,
+  kSharpYuvTransferFunctionBt601 = 6,
+  kSharpYuvTransferFunctionSmpte240 = 7,
+  kSharpYuvTransferFunctionLinear = 8,
+  kSharpYuvTransferFunctionLog100 = 9,
+  kSharpYuvTransferFunctionLog100_Sqrt10 = 10,
+  kSharpYuvTransferFunctionIec61966 = 11,
+  kSharpYuvTransferFunctionBt1361 = 12,
+  kSharpYuvTransferFunctionSrgb = 13,
+  kSharpYuvTransferFunctionBt2020_10Bit = 14,
+  kSharpYuvTransferFunctionBt2020_12Bit = 15,
+  kSharpYuvTransferFunctionSmpte2084 = 16,  // PQ
+  kSharpYuvTransferFunctionSmpte428 = 17,
+  kSharpYuvTransferFunctionHlg = 18,
+  // Not a transfer function: implicitly 19, one past the last value above.
+  kSharpYuvTransferFunctionNum
+} SharpYuvTransferFunctionType;
+
 // Converts RGB to YUV420 using a downsampling algorithm that minimizes
 // artefacts caused by chroma subsampling.
 // This is slower than standard downsampling (averaging of 4 UV values).
@ -85,6 +134,10 @@ typedef struct {
 //     adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
 //     should be multiples of 2.
 // width, height: width and height of the image in pixels
+// yuv_matrix: RGB to YUV conversion matrix. The matrix values typically
+//     depend on the input's rgb_bit_depth.
+// This function calls SharpYuvConvertWithOptions with a default transfer
+// function of kSharpYuvTransferFunctionSrgb.
 SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
                                    const void* b_ptr, int rgb_step,
                                    int rgb_stride, int rgb_bit_depth,
@ -93,6 +146,31 @@ SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
                                    int yuv_bit_depth, int width, int height,
                                    const SharpYuvConversionMatrix* yuv_matrix);

+// Options passed to SharpYuvConvertWithOptions().
+struct SharpYuvOptions {
+  // This matrix cannot be NULL and can be initialized by
+  // SharpYuvComputeConversionMatrix.
+  const SharpYuvConversionMatrix* yuv_matrix;
+  // Transfer function selector. SharpYuvOptionsInit() sets it to
+  // kSharpYuvTransferFunctionSrgb.
+  SharpYuvTransferFunctionType transfer_type;
+};
+
+// Internal, version-checked, entry point
+SHARPYUV_EXTERN int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix*,
+                                                SharpYuvOptions*, int);
+
+// Should always be called, to initialize a fresh SharpYuvOptions
+// structure before modification. SharpYuvOptionsInit() must have succeeded
+// before using the 'options' object.
+// Forwards to SharpYuvOptionsInitInternal() with the SHARPYUV_VERSION this
+// header was compiled with, and returns its result: 0 on a NULL argument or
+// a version mismatch, 1 on success.
+static SHARPYUV_INLINE int SharpYuvOptionsInit(
+    const SharpYuvConversionMatrix* yuv_matrix, SharpYuvOptions* options) {
+  return SharpYuvOptionsInitInternal(yuv_matrix, options, SHARPYUV_VERSION);
+}
+
+SHARPYUV_EXTERN int SharpYuvConvertWithOptions(
+    const void* r_ptr, const void* g_ptr, const void* b_ptr, int rgb_step,
+    int rgb_stride, int rgb_bit_depth, void* y_ptr, int y_stride, void* u_ptr,
+    int u_stride, void* v_ptr, int v_stride, int yuv_bit_depth, int width,
+    int height, const SharpYuvOptions* options);
+
 // TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422
 // support (it's rarely used in practice, especially for images).

--- a/sharpyuv/sharpyuv_csp.c
+++ b/sharpyuv/sharpyuv_csp.c
@ -22,16 +22,16 @@ void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
  const float kr = yuv_color_space->kr;
  const float kb = yuv_color_space->kb;
  const float kg = 1.0f - kr - kb;
-  const float cr = 0.5f / (1.0f - kb);
-  const float cb = 0.5f / (1.0f - kr);
+  const float cb = 0.5f / (1.0f - kb);
+  const float cr = 0.5f / (1.0f - kr);

  const int shift = yuv_color_space->bit_depth - 8;

  const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
  float scale_y = 1.0f;
  float add_y = 0.0f;
-  float scale_u = cr;
-  float scale_v = cb;
+  float scale_u = cb;
+  float scale_v = cr;
  float add_uv = (float)(128 << shift);
  assert(yuv_color_space->bit_depth >= 8);

@ -59,31 +59,35 @@ void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
 }

 // Matrices are in YUV_FIX fixed point precision.
-// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
+// WebP's matrix, similar but not identical to kRec601LimitedMatrix
+// Derived using the following formulas:
+// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
 static const SharpYuvConversionMatrix kWebpMatrix = {
  {16839, 33059, 6420, 16 << 16},
  {-9719, -19081, 28800, 128 << 16},
  {28800, -24116, -4684, 128 << 16},
 };
-// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
+// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
 static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
  {16829, 33039, 6416, 16 << 16},
  {-9714, -19071, 28784, 128 << 16},
  {28784, -24103, -4681, 128 << 16},
 };
-// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
+// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
 static const SharpYuvConversionMatrix kRec601FullMatrix = {
  {19595, 38470, 7471, 0},
  {-11058, -21710, 32768, 128 << 16},
  {32768, -27439, -5329, 128 << 16},
 };
-// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
+// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
 static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
  {11966, 40254, 4064, 16 << 16},
  {-6596, -22189, 28784, 128 << 16},
  {28784, -26145, -2639, 128 << 16},
 };
-// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
+// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
 static const SharpYuvConversionMatrix kRec709FullMatrix = {
  {13933, 46871, 4732, 0},
  {-7509, -25259, 32768, 128 << 16},
--- a/sharpyuv/sharpyuv_csp.h
+++ b/sharpyuv/sharpyuv_csp.h
@ -41,10 +41,15 @@ SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix(

 // Enums for precomputed conversion matrices.
 typedef enum {
+  // WebP's matrix, similar but not identical to kSharpYuvMatrixRec601Limited
  kSharpYuvMatrixWebp = 0,
+  // Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
  kSharpYuvMatrixRec601Limited,
+  // Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
  kSharpYuvMatrixRec601Full,
+  // Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
  kSharpYuvMatrixRec709Limited,
+  // Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
  kSharpYuvMatrixRec709Full,
  kSharpYuvMatrixNum
 } SharpYuvMatrixType;
--- a/sharpyuv/sharpyuv_dsp.c
+++ b/sharpyuv/sharpyuv_dsp.c
@ -17,6 +17,7 @@
 #include <stdlib.h>

 #include "sharpyuv/sharpyuv_cpu.h"
+#include "src/webp/types.h"

 //-----------------------------------------------------------------------------

@ -69,8 +70,7 @@ uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
 void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst,
                          int len);
 void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
-                          const uint16_t* best_y, uint16_t* out,
-                          int bit_depth);
+                          const uint16_t* best_y, uint16_t* out, int bit_depth);

 extern VP8CPUInfo SharpYuvGetCPUInfo;
 extern void InitSharpYuvSSE2(void);
--- a/sharpyuv/sharpyuv_gamma.c
+++ b/sharpyuv/sharpyuv_gamma.c
@ -12,6 +12,7 @@
 #include "sharpyuv/sharpyuv_gamma.h"

 #include <assert.h>
+#include <float.h>
 #include <math.h>

 #include "src/webp/types.h"
@ -97,7 +98,7 @@ static WEBP_INLINE uint32_t FixedPointInterpolation(int v, uint32_t* tab,
  return result;
 }

-uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth) {
+static uint32_t ToLinearSrgb(uint16_t v, int bit_depth) {
  const int shift = GAMMA_TO_LINEAR_TAB_BITS - bit_depth;
  if (shift > 0) {
    return kGammaToLinearTabS[v << shift];
@ -105,9 +106,314 @@ uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth) {
  return FixedPointInterpolation(v, kGammaToLinearTabS, -shift, 0);
 }

-uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth) {
+static uint16_t FromLinearSrgb(uint32_t value, int bit_depth) {
  return FixedPointInterpolation(
      value, kLinearToGammaTabS,
      (GAMMA_TO_LINEAR_BITS - LINEAR_TO_GAMMA_TAB_BITS),
      bit_depth - GAMMA_TO_LINEAR_BITS);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Note: these macros may evaluate their arguments more than once, so the
+// arguments should be free of side effects.
+#define CLAMP(x, low, high) \
+  (((x) < (low)) ? (low) : (((high) < (x)) ? (high) : (x)))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+// Rounds 'x' to the nearest integer value, with halfway cases rounded away
+// from zero. Built on the double-precision floor()/ceil().
+static WEBP_INLINE float Roundf(float x) {
+  const double rounded =
+      (x < 0) ? ceil((double)(x - 0.5f)) : floor((double)(x + 0.5f));
+  return (float)rounded;
+}
+
+// Single-precision wrapper around the double-precision pow().
+static WEBP_INLINE float Powf(float base, float exp) {
+  return (float)pow((double)base, (double)exp);
+}
+
+// Single-precision wrapper around the double-precision log10().
+static WEBP_INLINE float Log10f(float x) { return (float)log10((double)x); }
+
+// BT.709-style curve, gamma -> linear. The result is clamped to [0, 1].
+static float ToLinear709(float gamma) {
+  if (gamma < 0.f) return 0.f;
+  // Linear segment near black.
+  if (gamma < 4.5f * 0.018053968510807f) return gamma / 4.5f;
+  // Power-law segment.
+  if (gamma < 1.f) {
+    return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
+  }
+  return 1.f;
+}
+
+// BT.709-style curve, linear -> gamma. The result is clamped to [0, 1].
+static float FromLinear709(float linear) {
+  if (linear < 0.f) return 0.f;
+  // Linear segment near black.
+  if (linear < 0.018053968510807f) return linear * 4.5f;
+  // Power-law segment.
+  if (linear < 1.f) {
+    return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
+  }
+  return 1.f;
+}
+
+// Pure power law with exponent 2.2; the input is clamped to [0, 1] first.
+static float ToLinear470M(float gamma) {
+  const float clamped = CLAMP(gamma, 0.f, 1.f);
+  return Powf(clamped, 2.2f);
+}
+
+// Pure power law with exponent 1/2.2; the input is clamped to [0, 1] first.
+static float FromLinear470M(float linear) {
+  const float clamped = CLAMP(linear, 0.f, 1.f);
+  return Powf(clamped, 1.f / 2.2f);
+}
+
+// Pure power law with exponent 2.8; the input is clamped to [0, 1] first.
+static float ToLinear470Bg(float gamma) {
+  const float clamped = CLAMP(gamma, 0.f, 1.f);
+  return Powf(clamped, 2.8f);
+}
+
+// Pure power law with exponent 1/2.8; the input is clamped to [0, 1] first.
+static float FromLinear470Bg(float linear) {
+  const float clamped = CLAMP(linear, 0.f, 1.f);
+  return Powf(clamped, 1.f / 2.8f);
+}
+
+// SMPTE 240 curve, gamma -> linear. The result is clamped to [0, 1].
+static float ToLinearSmpte240(float gamma) {
+  if (gamma < 0.f) return 0.f;
+  // Linear segment near black.
+  if (gamma < 4.f * 0.022821585529445f) return gamma / 4.f;
+  // Power-law segment.
+  if (gamma < 1.f) {
+    return Powf((gamma + 0.111572195921731f) / 1.111572195921731f, 1.f / 0.45f);
+  }
+  return 1.f;
+}
+
+// SMPTE 240 curve, linear -> gamma. The result is clamped to [0, 1].
+static float FromLinearSmpte240(float linear) {
+  if (linear < 0.f) return 0.f;
+  // Linear segment near black.
+  if (linear < 0.022821585529445f) return linear * 4.f;
+  // Power-law segment.
+  if (linear < 1.f) {
+    return 1.111572195921731f * Powf(linear, 0.45f) - 0.111572195921731f;
+  }
+  return 1.f;
+}
+
+// Logarithmic curve (100:1 range), gamma -> linear.
+static float ToLinearLog100(float gamma) {
+  // FromLinearLog100() maps all of [0, 0.01) to 0, so the inverse of 0 is not
+  // unique: return the middle of that interval.
+  const float mid_interval = 0.01f / 2.f;
+  if (gamma <= 0.0f) return mid_interval;
+  return Powf(10.0f, 2.f * (MIN(gamma, 1.f) - 1.0f));
+}
+
+// Logarithmic curve (100:1 range), linear -> gamma. Values below 0.01 map
+// to 0; values above 1 are treated as 1.
+static float FromLinearLog100(float linear) {
+  if (linear < 0.01f) return 0.0f;
+  return 1.0f + Log10f(MIN(linear, 1.f)) / 2.0f;
+}
+
+// Logarithmic curve (100 * sqrt(10) : 1 range), gamma -> linear.
+static float ToLinearLog100Sqrt10(float gamma) {
+  // FromLinearLog100Sqrt10() maps all of [0, 0.00316227766) to 0, so the
+  // inverse of 0 is not unique: return the middle of that interval.
+  const float mid_interval = 0.00316227766f / 2.f;
+  if (gamma <= 0.0f) return mid_interval;
+  return Powf(10.0f, 2.5f * (MIN(gamma, 1.f) - 1.0f));
+}
+
+// Logarithmic curve (100 * sqrt(10) : 1 range), linear -> gamma. Values below
+// 0.00316227766 map to 0; values above 1 are treated as 1.
+static float FromLinearLog100Sqrt10(float linear) {
+  if (linear < 0.00316227766f) return 0.0f;
+  return 1.0f + Log10f(MIN(linear, 1.f)) / 2.5f;
+}
+
+// Inverse of FromLinearIec61966(): gamma -> linear for the extended-range
+// curve, which is odd-symmetric around zero and is not clamped.
+static float ToLinearIec61966(float gamma) {
+  if (gamma <= -4.5f * 0.018053968510807f) {
+    // Mirror of the positive power-law segment. The base passed to Powf() must
+    // stay positive (a negative base with a non-integer exponent yields NaN),
+    // so the sign is applied to the result instead.
+    return -Powf((-gamma + 0.09929682680944f) / 1.09929682680944f,
+                 1.f / 0.45f);
+  } else if (gamma < 4.5f * 0.018053968510807f) {
+    // Linear segment around zero.
+    return gamma / 4.5f;
+  }
+  return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
+}
+
+// Extended-range curve, linear -> gamma. Negative input is handled by a
+// mirrored power-law segment; the result is not clamped.
+static float FromLinearIec61966(float linear) {
+  // Mirrored power-law segment for sufficiently negative values.
+  if (linear <= -0.018053968510807f) {
+    return -1.09929682680944f * Powf(-linear, 0.45f) + 0.09929682680944f;
+  }
+  // Linear segment around zero.
+  if (linear < 0.018053968510807f) return linear * 4.5f;
+  // Power-law segment.
+  return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
+}
+
+// BT.1361 extended-gamut curve, gamma -> linear. The result is limited to
+// [-0.25, 1].
+static float ToLinearBt1361(float gamma) {
+  if (gamma < -0.25f) return -0.25f;
+  // Negative segment.
+  if (gamma < 0.f) {
+    return Powf((gamma - 0.02482420670236f) / -0.27482420670236f, 1.f / 0.45f) /
+           -4.f;
+  }
+  // Linear segment near black.
+  if (gamma < 4.5f * 0.018053968510807f) return gamma / 4.5f;
+  // Power-law segment.
+  if (gamma < 1.f) {
+    return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
+  }
+  return 1.f;
+}
+
+// BT.1361 extended-gamut curve, linear -> gamma. Input below -0.25 yields
+// -0.25 and input of 1 or more yields 1.
+static float FromLinearBt1361(float linear) {
+  if (linear < -0.25f) return -0.25f;
+  // Negative segment.
+  if (linear < 0.f) {
+    return -0.27482420670236f * Powf(-4.f * linear, 0.45f) + 0.02482420670236f;
+  }
+  // Linear segment near black.
+  if (linear < 0.018053968510807f) return linear * 4.5f;
+  // Power-law segment.
+  if (linear < 1.f) {
+    return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
+  }
+  return 1.f;
+}
+
+// PQ (SMPTE 2084) curve, gamma -> linear. Anything that is not strictly
+// positive maps to 0.
+static float ToLinearPq(float gamma) {
+  float pow_gamma, num, den;
+  if (!(gamma > 0.f)) return 0.f;
+  pow_gamma = Powf(gamma, 32.f / 2523.f);
+  // Keep the numerator non-negative and the denominator strictly positive.
+  num = MAX(pow_gamma - 107.f / 128.f, 0.0f);
+  den = MAX(2413.f / 128.f - 2392.f / 128.f * pow_gamma, FLT_MIN);
+  return Powf(num / den, 4096.f / 653.f);
+}
+
+// PQ (SMPTE 2084) curve, linear -> gamma. Anything that is not strictly
+// positive maps to 0.
+static float FromLinearPq(float linear) {
+  float pow_linear, num, den;
+  if (!(linear > 0.f)) return 0.f;
+  pow_linear = Powf(linear, 653.f / 4096.f);
+  num = 107.f / 128.f + 2413.f / 128.f * pow_linear;
+  den = 1.0f + 2392.f / 128.f * pow_linear;
+  return Powf(num / den, 2523.f / 32.f);
+}
+
+// SMPTE 428 curve, gamma -> linear. Negative input is treated as 0; there is
+// no upper clamp.
+static float ToLinearSmpte428(float gamma) {
+  const float non_negative = MAX(gamma, 0.f);
+  return Powf(non_negative, 2.6f) / 0.91655527974030934f;
+}
+
+// SMPTE 428 curve, linear -> gamma. Negative input is treated as 0; there is
+// no upper clamp.
+static float FromLinearSmpte428(float linear) {
+  const float non_negative = MAX(linear, 0.f);
+  return Powf(0.91655527974030934f * non_negative, 1.f / 2.6f);
+}
+
+// Conversion in BT.2100 requires RGB info. Simplify to gamma correction here.
+// Negative input maps to 0. Otherwise a piecewise value (quadratic up to 0.5,
+// exponential above) is raised to the power 1.2.
+static float ToLinearHlg(float gamma) {
+  float scene;
+  if (gamma < 0.f) return 0.f;
+  if (gamma <= 0.5f) {
+    scene = (gamma * gamma) * (1.f / 3.f);
+  } else {
+    scene = (expf((gamma - 0.55991073f) / 0.17883277f) + 0.28466892f) / 12.0f;
+  }
+  return Powf(scene, 1.2f);
+}
+
+// Inverse of ToLinearHlg(): undoes the 1.2 power, then applies the piecewise
+// square-root / logarithmic curve. Negative input maps to 0.
+static float FromLinearHlg(float linear) {
+  // Reject negative input before Powf(): a negative base with a non-integer
+  // exponent yields NaN, which would slip through every comparison below.
+  if (linear < 0.f) {
+    return 0.f;
+  }
+  linear = Powf(linear, 1.f / 1.2f);
+  if (linear <= (1.f / 12.f)) {
+    return sqrtf(3.f * linear);
+  }
+  return 0.17883277f * logf(12.f * linear - 0.28466892f) + 0.55991073f;
+}
+
+// Converts 'v', a gamma-encoded sample on 'bit_depth' bits, to a linear value
+// using the curve selected by 'transfer_type'.
+// - sRGB is delegated to ToLinearSrgb().
+// - kSharpYuvTransferFunctionLinear returns 'v' unchanged (no rescaling).
+// - Every other supported type normalizes 'v' by (1 << bit_depth) - 1, applies
+//   the matching float ToLinear*() curve, then scales by 65535 and rounds.
+// An unsupported 'transfer_type' triggers assert(0); if asserts are compiled
+// out the result is 0.
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
+                               SharpYuvTransferFunctionType transfer_type) {
+  float v_float, linear;
+  if (transfer_type == kSharpYuvTransferFunctionSrgb) {
+    return ToLinearSrgb(v, bit_depth);
+  }
+  // Normalize by the maximum 'bit_depth'-bit value.
+  v_float = (float)v / ((1 << bit_depth) - 1);
+  switch (transfer_type) {
+    // These four types share the same curve implementation.
+    case kSharpYuvTransferFunctionBt709:
+    case kSharpYuvTransferFunctionBt601:
+    case kSharpYuvTransferFunctionBt2020_10Bit:
+    case kSharpYuvTransferFunctionBt2020_12Bit:
+      linear = ToLinear709(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt470M:
+      linear = ToLinear470M(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt470Bg:
+      linear = ToLinear470Bg(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte240:
+      linear = ToLinearSmpte240(v_float);
+      break;
+    case kSharpYuvTransferFunctionLinear:
+      // Identity: the raw input value is returned as is.
+      return v;
+    case kSharpYuvTransferFunctionLog100:
+      linear = ToLinearLog100(v_float);
+      break;
+    case kSharpYuvTransferFunctionLog100_Sqrt10:
+      linear = ToLinearLog100Sqrt10(v_float);
+      break;
+    case kSharpYuvTransferFunctionIec61966:
+      linear = ToLinearIec61966(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt1361:
+      linear = ToLinearBt1361(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte2084:
+      linear = ToLinearPq(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte428:
+      linear = ToLinearSmpte428(v_float);
+      break;
+    case kSharpYuvTransferFunctionHlg:
+      linear = ToLinearHlg(v_float);
+      break;
+    default:
+      assert(0);
+      linear = 0;
+      break;
+  }
+  // Scale to the 16-bit range and round to nearest.
+  return (uint32_t)Roundf(linear * ((1 << 16) - 1));
+}
+
+// Converts 'v', a linear value, to a gamma-encoded sample on 'bit_depth' bits
+// using the curve selected by 'transfer_type'.
+// - sRGB is delegated to FromLinearSrgb().
+// - kSharpYuvTransferFunctionLinear returns 'v' unchanged (no rescaling; the
+//   value is converted to the uint16_t return type).
+// - Every other supported type normalizes 'v' by 65535, applies the matching
+//   float FromLinear*() curve, then scales by (1 << bit_depth) - 1 and rounds.
+// An unsupported 'transfer_type' triggers assert(0); if asserts are compiled
+// out the result is 0.
+uint16_t SharpYuvLinearToGamma(uint32_t v, int bit_depth,
+                               SharpYuvTransferFunctionType transfer_type) {
+  // Note: despite its name, 'linear' holds the gamma-encoded result here.
+  float v_float, linear;
+  if (transfer_type == kSharpYuvTransferFunctionSrgb) {
+    return FromLinearSrgb(v, bit_depth);
+  }
+  // Normalize by the maximum 16-bit value.
+  v_float = (float)v / ((1 << 16) - 1);
+  switch (transfer_type) {
+    // These four types share the same curve implementation.
+    case kSharpYuvTransferFunctionBt709:
+    case kSharpYuvTransferFunctionBt601:
+    case kSharpYuvTransferFunctionBt2020_10Bit:
+    case kSharpYuvTransferFunctionBt2020_12Bit:
+      linear = FromLinear709(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt470M:
+      linear = FromLinear470M(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt470Bg:
+      linear = FromLinear470Bg(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte240:
+      linear = FromLinearSmpte240(v_float);
+      break;
+    case kSharpYuvTransferFunctionLinear:
+      // Identity: the raw input value is returned as is.
+      return v;
+    case kSharpYuvTransferFunctionLog100:
+      linear = FromLinearLog100(v_float);
+      break;
+    case kSharpYuvTransferFunctionLog100_Sqrt10:
+      linear = FromLinearLog100Sqrt10(v_float);
+      break;
+    case kSharpYuvTransferFunctionIec61966:
+      linear = FromLinearIec61966(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt1361:
+      linear = FromLinearBt1361(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte2084:
+      linear = FromLinearPq(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte428:
+      linear = FromLinearSmpte428(v_float);
+      break;
+    case kSharpYuvTransferFunctionHlg:
+      linear = FromLinearHlg(v_float);
+      break;
+    default:
+      assert(0);
+      linear = 0;
+      break;
+  }
+  // Scale to the 'bit_depth'-bit range and round to nearest.
+  return (uint16_t)Roundf(linear * ((1 << bit_depth) - 1));
+}
--- a/sharpyuv/sharpyuv_gamma.h
+++ b/sharpyuv/sharpyuv_gamma.h
@ -12,6 +12,7 @@
 #ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
 #define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_

+#include "sharpyuv/sharpyuv.h"
 #include "src/webp/types.h"

 #ifdef __cplusplus
@ -22,11 +23,13 @@ extern "C" {
 // SharpYuvGammaToLinear or SharpYuvLinearToGamma.
 void SharpYuvInitGammaTables(void);

-// Converts a gamma color value on 'bit_depth' bits to a 16 bit linear value.
-uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth);
+// Converts a 'bit_depth'-bit gamma color value to a 16-bit linear value.
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
+                               SharpYuvTransferFunctionType transfer_type);

-// Converts a 16 bit linear color value to a gamma value on 'bit_depth' bits.
-uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth);
+// Converts a 16-bit linear color value to a 'bit_depth'-bit gamma value.
+uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth,
+                               SharpYuvTransferFunctionType transfer_type);

 #ifdef __cplusplus
 }  // extern "C"
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -36,7 +36,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 8:7:1
+libwebp_la_LDFLAGS = -no-undefined -version-info 8:10:1
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

@ -48,7 +48,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la

-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 4:7:1
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 4:10:1
  pkgconfig_DATA += libwebpdecoder.pc
 endif

--- a/src/dec/alpha_dec.c
+++ b/src/dec/alpha_dec.c
@ -13,18 +13,20 @@

 #include <stdlib.h>
 #include "src/dec/alphai_dec.h"
+#include "src/dec/vp8_dec.h"
 #include "src/dec/vp8i_dec.h"
 #include "src/dec/vp8li_dec.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/quant_levels_dec_utils.h"
 #include "src/utils/utils.h"
 #include "src/webp/format_constants.h"
+#include "src/webp/types.h"

 //------------------------------------------------------------------------------
 // ALPHDecoder object.

 // Allocates a new alpha decoder instance.
-static ALPHDecoder* ALPHNew(void) {
+WEBP_NODISCARD static ALPHDecoder* ALPHNew(void) {
  ALPHDecoder* const dec = (ALPHDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
  return dec;
 }
@ -45,9 +47,9 @@ static void ALPHDelete(ALPHDecoder* const dec) {
 // header for alpha data stored using lossless compression.
 // Returns false in case of error in alpha header (data too short, invalid
 // compression method or filter, error in lossless header data etc).
-static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
-                    size_t data_size, const VP8Io* const src_io,
-                    uint8_t* output) {
+WEBP_NODISCARD static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
+                                   size_t data_size, const VP8Io* const src_io,
+                                   uint8_t* output) {
  int ok = 0;
  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
@ -79,7 +81,9 @@ static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
  }

  // Copy the necessary parameters from src_io to io
-  VP8InitIo(io);
+  if (!VP8InitIo(io)) {
+    return 0;
+  }
  WebPInitCustomIo(NULL, io);
  io->opaque = dec;
  io->width = src_io->width;
@ -107,7 +111,8 @@ static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
 // starting from row number 'row'. It assumes that rows up to (row - 1) have
 // already been decoded.
 // Returns false in case of bitstream error.
-static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
+WEBP_NODISCARD static int ALPHDecode(VP8Decoder* const dec, int row,
+                                     int num_rows) {
  ALPHDecoder* const alph_dec = dec->alph_dec_;
  const int width = alph_dec->width_;
  const int height = alph_dec->io_.crop_bottom;
@ -117,21 +122,12 @@ static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
    const uint8_t* deltas = dec->alpha_data_ + ALPHA_HEADER_LEN + row * width;
    uint8_t* dst = dec->alpha_plane_ + row * width;
    assert(deltas <= &dec->alpha_data_[dec->alpha_data_size_]);
-    if (alph_dec->filter_ != WEBP_FILTER_NONE) {
-      assert(WebPUnfilters[alph_dec->filter_] != NULL);
-      for (y = 0; y < num_rows; ++y) {
-        WebPUnfilters[alph_dec->filter_](prev_line, deltas, dst, width);
-        prev_line = dst;
-        dst += width;
-        deltas += width;
-      }
-    } else {
-      for (y = 0; y < num_rows; ++y) {
-        memcpy(dst, deltas, width * sizeof(*dst));
-        prev_line = dst;
-        dst += width;
-        deltas += width;
-      }
+    assert(WebPUnfilters[alph_dec->filter_] != NULL);
+    for (y = 0; y < num_rows; ++y) {
+      WebPUnfilters[alph_dec->filter_](prev_line, deltas, dst, width);
+      prev_line = dst;
+      dst += width;
+      deltas += width;
    }
    dec->alpha_prev_line_ = prev_line;
  } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
@ -147,7 +143,8 @@ static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
  return 1;
 }

-static int AllocateAlphaPlane(VP8Decoder* const dec, const VP8Io* const io) {
+WEBP_NODISCARD static int AllocateAlphaPlane(VP8Decoder* const dec,
+                                             const VP8Io* const io) {
  const int stride = io->width;
  const int height = io->crop_bottom;
  const uint64_t alpha_size = (uint64_t)stride * height;
@ -155,7 +152,8 @@ static int AllocateAlphaPlane(VP8Decoder* const dec, const VP8Io* const io) {
  dec->alpha_plane_mem_ =
      (uint8_t*)WebPSafeMalloc(alpha_size, sizeof(*dec->alpha_plane_));
  if (dec->alpha_plane_mem_ == NULL) {
-    return 0;
+    return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                       "Alpha decoder initialization failed.");
  }
  dec->alpha_plane_ = dec->alpha_plane_mem_;
  dec->alpha_prev_line_ = NULL;
@ -174,9 +172,9 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec) {
 //------------------------------------------------------------------------------
 // Main entry point.

-const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
-                                      const VP8Io* const io,
-                                      int row, int num_rows) {
+WEBP_NODISCARD const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                                     const VP8Io* const io,
+                                                     int row, int num_rows) {
  const int width = io->width;
  const int height = io->crop_bottom;

@ -189,10 +187,19 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
  if (!dec->is_alpha_decoded_) {
    if (dec->alph_dec_ == NULL) {    // Initialize decoder.
      dec->alph_dec_ = ALPHNew();
-      if (dec->alph_dec_ == NULL) return NULL;
+      if (dec->alph_dec_ == NULL) {
+        VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                    "Alpha decoder initialization failed.");
+        return NULL;
+      }
      if (!AllocateAlphaPlane(dec, io)) goto Error;
      if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
                    io, dec->alpha_plane_)) {
+        VP8LDecoder* const vp8l_dec = dec->alph_dec_->vp8l_dec_;
+        VP8SetError(dec,
+                    (vp8l_dec == NULL) ? VP8_STATUS_OUT_OF_MEMORY
+                                       : vp8l_dec->status_,
+                    "Alpha decoder initialization failed.");
        goto Error;
      }
      // if we allowed use of alpha dithering, check whether it's needed at all
--- a/src/dec/buffer_dec.c
+++ b/src/dec/buffer_dec.c
@ -26,10 +26,9 @@ static const uint8_t kModeBpp[MODE_LAST] = {
  4, 4, 4, 2,    // pre-multiplied modes
  1, 1 };

-// Check that webp_csp_mode is within the bounds of WEBP_CSP_MODE.
 // Convert to an integer to handle both the unsigned/signed enum cases
 // without the need for casting to remove type limit warnings.
-static int IsValidColorspace(int webp_csp_mode) {
+int IsValidColorspace(int webp_csp_mode) {
  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
 }

@ -75,7 +74,7 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
    const int stride = abs(buf->stride);
    const uint64_t size =
-        MIN_BUFFER_SIZE(width * kModeBpp[mode], height, stride);
+        MIN_BUFFER_SIZE((uint64_t)width * kModeBpp[mode], height, stride);
    ok &= (size <= buf->size);
    ok &= (stride >= width * kModeBpp[mode]);
    ok &= (buf->rgba != NULL);
--- a/src/dec/common_dec.h
+++ b/src/dec/common_dec.h
@ -51,4 +51,7 @@ enum { MB_FEATURE_TREE_PROBS = 3,
       NUM_PROBAS = 11
     };

+// Check that webp_csp_mode is within the bounds of WEBP_CSP_MODE.
+int IsValidColorspace(int webp_csp_mode);
+
 #endif  // WEBP_DEC_COMMON_DEC_H_
--- a/src/dec/idec_dec.c
+++ b/src/dec/idec_dec.c
@ -17,8 +17,10 @@

 #include "src/dec/alphai_dec.h"
 #include "src/dec/webpi_dec.h"
+#include "src/dec/vp8_dec.h"
 #include "src/dec/vp8i_dec.h"
 #include "src/utils/utils.h"
+#include "src/webp/decode.h"

 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
@ -161,8 +163,9 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {

 // Appends data to the end of MemBuffer->buf_. It expands the allocated memory
 // size if required and also updates VP8BitReader's if new memory is allocated.
-static int AppendToMemBuffer(WebPIDecoder* const idec,
-                             const uint8_t* const data, size_t data_size) {
+WEBP_NODISCARD static int AppendToMemBuffer(WebPIDecoder* const idec,
+                                            const uint8_t* const data,
+                                            size_t data_size) {
  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  MemBuffer* const mem = &idec->mem_;
  const int need_compressed_alpha = NeedCompressedAlpha(idec);
@ -203,8 +206,9 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
  return 1;
 }

-static int RemapMemBuffer(WebPIDecoder* const idec,
-                          const uint8_t* const data, size_t data_size) {
+WEBP_NODISCARD static int RemapMemBuffer(WebPIDecoder* const idec,
+                                         const uint8_t* const data,
+                                         size_t data_size) {
  MemBuffer* const mem = &idec->mem_;
  const uint8_t* const old_buf = mem->buf_;
  const uint8_t* const old_start =
@ -237,7 +241,8 @@ static void ClearMemBuffer(MemBuffer* const mem) {
  }
 }

-static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {
+WEBP_NODISCARD static int CheckMemBufferMode(MemBuffer* const mem,
+                                             MemBufferMode expected) {
  if (mem->mode_ == MEM_MODE_NONE) {
    mem->mode_ = expected;    // switch to the expected mode
  } else if (mem->mode_ != expected) {
@ -248,7 +253,7 @@ static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {
 }

 // To be called last.
-static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
+WEBP_NODISCARD static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
  const WebPDecoderOptions* const options = idec->params_.options;
  WebPDecBuffer* const output = idec->params_.output;

@ -258,8 +263,10 @@ static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
    if (status != VP8_STATUS_OK) return status;
  }
  if (idec->final_output_ != NULL) {
-    WebPCopyDecBufferPixels(output, idec->final_output_);  // do the slow-copy
+    const VP8StatusCode status = WebPCopyDecBufferPixels(
+        output, idec->final_output_);  // do the slow-copy
    WebPFreeDecBuffer(&idec->output_);
+    if (status != VP8_STATUS_OK) return status;
    *output = *idec->final_output_;
    idec->final_output_ = NULL;
  }
@ -288,7 +295,7 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
 static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
  if (idec->state_ == STATE_VP8_DATA) {
    // Synchronize the thread, clean-up and check for errors.
-    VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
+    (void)VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
  }
  idec->state_ = STATE_ERROR;
  return error;
@ -329,6 +336,7 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
    if (dec == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
+    dec->incremental_ = 1;
    idec->dec_ = dec;
    dec->alpha_data_ = headers.alpha_data;
    dec->alpha_data_size_ = headers.alpha_data_size;
@ -601,8 +609,9 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
 //------------------------------------------------------------------------------
 // Internal constructor

-static WebPIDecoder* NewDecoder(WebPDecBuffer* const output_buffer,
-                                const WebPBitstreamFeatures* const features) {
+WEBP_NODISCARD static WebPIDecoder* NewDecoder(
+    WebPDecBuffer* const output_buffer,
+    const WebPBitstreamFeatures* const features) {
  WebPIDecoder* idec = (WebPIDecoder*)WebPSafeCalloc(1ULL, sizeof(*idec));
  if (idec == NULL) {
    return NULL;
@ -614,8 +623,10 @@ static WebPIDecoder* NewDecoder(WebPDecBuffer* const output_buffer,
  idec->last_mb_y_ = -1;

  InitMemBuffer(&idec->mem_);
-  WebPInitDecBuffer(&idec->output_);
-  VP8InitIo(&idec->io_);
+  if (!WebPInitDecBuffer(&idec->output_) || !VP8InitIo(&idec->io_)) {
+    WebPSafeFree(idec);
+    return NULL;
+  }

  WebPResetDecParams(&idec->params_);
  if (output_buffer == NULL || WebPAvoidSlowMemory(output_buffer, features)) {
@ -674,7 +685,8 @@ void WebPIDelete(WebPIDecoder* idec) {
    if (!idec->is_lossless_) {
      if (idec->state_ == STATE_VP8_DATA) {
        // Synchronize the thread, clean-up and check for errors.
-        VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
+        // TODO(vrabaud) do we care about the return result?
+        (void)VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
      }
      VP8Delete((VP8Decoder*)idec->dec_);
    } else {
@ -851,8 +863,8 @@ const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
  return src;
 }

-uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
-                        int* width, int* height, int* stride) {
+WEBP_NODISCARD uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
+                                       int* width, int* height, int* stride) {
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
  if (src == NULL) return NULL;
  if (src->colorspace >= MODE_YUV) {
@ -867,10 +879,10 @@ uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
  return src->u.RGBA.rgba;
 }

-uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
-                         uint8_t** u, uint8_t** v, uint8_t** a,
-                         int* width, int* height,
-                         int* stride, int* uv_stride, int* a_stride) {
+WEBP_NODISCARD uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
+                                        uint8_t** u, uint8_t** v, uint8_t** a,
+                                        int* width, int* height, int* stride,
+                                        int* uv_stride, int* a_stride) {
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
  if (src == NULL) return NULL;
  if (src->colorspace < MODE_YUV) {
--- a/src/dec/io_dec.c
+++ b/src/dec/io_dec.c
@ -12,7 +12,9 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <assert.h>
+#include <stddef.h>
 #include <stdlib.h>
+
 #include "src/dec/vp8i_dec.h"
 #include "src/dec/webpi_dec.h"
 #include "src/dsp/dsp.h"
@ -25,9 +27,9 @@
 static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
  WebPDecBuffer* output = p->output;
  const WebPYUVABuffer* const buf = &output->u.YUVA;
-  uint8_t* const y_dst = buf->y + (size_t)io->mb_y * buf->y_stride;
-  uint8_t* const u_dst = buf->u + (size_t)(io->mb_y >> 1) * buf->u_stride;
-  uint8_t* const v_dst = buf->v + (size_t)(io->mb_y >> 1) * buf->v_stride;
+  uint8_t* const y_dst = buf->y + (ptrdiff_t)io->mb_y * buf->y_stride;
+  uint8_t* const u_dst = buf->u + (ptrdiff_t)(io->mb_y >> 1) * buf->u_stride;
+  uint8_t* const v_dst = buf->v + (ptrdiff_t)(io->mb_y >> 1) * buf->v_stride;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
  const int uv_w = (mb_w + 1) / 2;
@ -42,7 +44,7 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
  WebPDecBuffer* const output = p->output;
  WebPRGBABuffer* const buf = &output->u.RGBA;
-  uint8_t* const dst = buf->rgba + (size_t)io->mb_y * buf->stride;
+  uint8_t* const dst = buf->rgba + (ptrdiff_t)io->mb_y * buf->stride;
  WebPSamplerProcessPlane(io->y, io->y_stride,
                          io->u, io->v, io->uv_stride,
                          dst, buf->stride, io->mb_w, io->mb_h,
@ -57,7 +59,7 @@ static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
  int num_lines_out = io->mb_h;   // a priori guess
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + (size_t)io->mb_y * buf->stride;
+  uint8_t* dst = buf->rgba + (ptrdiff_t)io->mb_y * buf->stride;
  WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
  const uint8_t* cur_y = io->y;
  const uint8_t* cur_u = io->u;
@ -128,7 +130,7 @@ static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
-  uint8_t* dst = buf->a + (size_t)io->mb_y * buf->a_stride;
+  uint8_t* dst = buf->a + (ptrdiff_t)io->mb_y * buf->a_stride;
  int j;
  (void)expected_num_lines_out;
  assert(expected_num_lines_out == mb_h);
@ -181,8 +183,8 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
    int num_rows;
-    const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
-    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + (ptrdiff_t)start_y * buf->stride;
    uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
    const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
                                            num_rows, dst, buf->stride);
@ -205,8 +207,8 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
    const WEBP_CSP_MODE colorspace = p->output->colorspace;
    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
    int num_rows;
-    const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
-    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + (ptrdiff_t)start_y * buf->stride;
 #if (WEBP_SWAP_16BIT_CSP == 1)
    uint8_t* alpha_dst = base_rgba;
 #else
@ -271,9 +273,9 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
                                int expected_num_lines_out) {
  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
-  uint8_t* const dst_a = buf->a + (size_t)p->last_y * buf->a_stride;
+  uint8_t* const dst_a = buf->a + (ptrdiff_t)p->last_y * buf->a_stride;
  if (io->a != NULL) {
-    uint8_t* const dst_y = buf->y + (size_t)p->last_y * buf->y_stride;
+    uint8_t* const dst_y = buf->y + (ptrdiff_t)p->last_y * buf->y_stride;
    const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a);
    assert(expected_num_lines_out == num_lines_out);
    if (num_lines_out > 0) {   // unmultiply the Y
@ -362,7 +364,7 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
  const WebPYUV444Converter convert =
      WebPYUV444Converters[p->output->colorspace];
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + (size_t)y_pos * buf->stride;
+  uint8_t* dst = buf->rgba + (ptrdiff_t)y_pos * buf->stride;
  int num_lines_out = 0;
  // For RGB rescaling, because of the YUV420, current scan position
  // U/V can be +1/-1 line from the Y one.  Hence the double test.
@ -389,14 +391,14 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
  while (j < mb_h) {
    const int y_lines_in =
        WebPRescalerImport(p->scaler_y, mb_h - j,
-                           io->y + (size_t)j * io->y_stride, io->y_stride);
+                           io->y + (ptrdiff_t)j * io->y_stride, io->y_stride);
    j += y_lines_in;
    if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) {
      const int u_lines_in = WebPRescalerImport(
-          p->scaler_u, uv_mb_h - uv_j, io->u + (size_t)uv_j * io->uv_stride,
+          p->scaler_u, uv_mb_h - uv_j, io->u + (ptrdiff_t)uv_j * io->uv_stride,
          io->uv_stride);
      const int v_lines_in = WebPRescalerImport(
-          p->scaler_v, uv_mb_h - uv_j, io->v + (size_t)uv_j * io->uv_stride,
+          p->scaler_v, uv_mb_h - uv_j, io->v + (ptrdiff_t)uv_j * io->uv_stride,
          io->uv_stride);
      (void)v_lines_in;   // remove a gcc warning
      assert(u_lines_in == v_lines_in);
@ -409,7 +411,7 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {

 static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (ptrdiff_t)y_pos * buf->stride;
  const WEBP_CSP_MODE colorspace = p->output->colorspace;
  const int alpha_first =
      (colorspace == MODE_ARGB || colorspace == MODE_Argb);
@ -437,7 +439,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
 static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
                               int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (ptrdiff_t)y_pos * buf->stride;
 #if (WEBP_SWAP_16BIT_CSP == 1)
  uint8_t* alpha_dst = base_rgba;
 #else
@ -476,7 +478,7 @@ static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
    int lines_left = expected_num_out_lines;
    const int y_end = p->last_y + lines_left;
    while (lines_left > 0) {
-      const int64_t row_offset = (int64_t)scaler->src_y - io->mb_y;
+      const int64_t row_offset = (ptrdiff_t)scaler->src_y - io->mb_y;
      WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y,
                         io->a + row_offset * io->width, io->width);
      lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left);
--- a/src/dec/tree_dec.c
+++ b/src/dec/tree_dec.c
@ -16,7 +16,8 @@
 #include "src/utils/bit_reader_inl_utils.h"

 #if !defined(USE_GENERIC_TREE)
-#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64
+#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64 && \
+    !defined(__wasm__)
 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
 #define USE_GENERIC_TREE 1   // ALTERNATE_CODE
 #else
--- a/src/dec/vp8_dec.c
+++ b/src/dec/vp8_dec.c
@ -86,6 +86,8 @@ void VP8Delete(VP8Decoder* const dec) {

 int VP8SetError(VP8Decoder* const dec,
                VP8StatusCode error, const char* const msg) {
+  // VP8_STATUS_SUSPENDED is only meaningful in incremental decoding.
+  assert(dec->incremental_ || error != VP8_STATUS_SUSPENDED);
  // The oldest error reported takes precedence over the new one.
  if (dec->status_ == VP8_STATUS_OK) {
    dec->status_ = error;
@ -190,12 +192,12 @@ static int ParseSegmentHeader(VP8BitReader* br,
 }

 // Paragraph 9.5
-// This function returns VP8_STATUS_SUSPENDED if we don't have all the
-// necessary data in 'buf'.
-// This case is not necessarily an error (for incremental decoding).
-// Still, no bitreader is ever initialized to make it possible to read
-// unavailable memory.
-// If we don't even have the partitions' sizes, than VP8_STATUS_NOT_ENOUGH_DATA
+// If we don't have all the necessary data in 'buf', this function returns
+// VP8_STATUS_SUSPENDED in incremental decoding, VP8_STATUS_NOT_ENOUGH_DATA
+// otherwise.
+// In incremental decoding, this case is not necessarily an error. Still, no
+// bitreader is ever initialized such that it could read unavailable memory.
+// If we don't even have the partitions' sizes, then VP8_STATUS_NOT_ENOUGH_DATA
 // is returned, and this is an unrecoverable error.
 // If the partitions were positioned ok, VP8_STATUS_OK is returned.
 static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
@ -225,8 +227,10 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
    sz += 3;
  }
  VP8InitBitReader(dec->parts_ + last_part, part_start, size_left);
-  return (part_start < buf_end) ? VP8_STATUS_OK :
-           VP8_STATUS_SUSPENDED;   // Init is ok, but there's not enough data
+  if (part_start < buf_end) return VP8_STATUS_OK;
+  return dec->incremental_
+             ? VP8_STATUS_SUSPENDED  // Init is ok, but there's not enough data
+             : VP8_STATUS_NOT_ENOUGH_DATA;
 }

 // Paragraph 9.4
--- a/src/dec/vp8_dec.h
+++ b/src/dec/vp8_dec.h
@ -15,6 +15,7 @@
 #define WEBP_DEC_VP8_DEC_H_

 #include "src/webp/decode.h"
+#include "src/webp/types.h"

 #ifdef __cplusplus
 extern "C" {
@ -108,16 +109,14 @@ struct VP8Io {
 };

 // Internal, version-checked, entry point
-int VP8InitIoInternal(VP8Io* const, int);
+WEBP_NODISCARD int VP8InitIoInternal(VP8Io* const, int);

 // Set the custom IO function pointers and user-data. The setter for IO hooks
 // should be called before initiating incremental decoding. Returns true if
 // WebPIDecoder object is successfully modified, false otherwise.
-int WebPISetIOHooks(WebPIDecoder* const idec,
-                    VP8IoPutHook put,
-                    VP8IoSetupHook setup,
-                    VP8IoTeardownHook teardown,
-                    void* user_data);
+WEBP_NODISCARD int WebPISetIOHooks(WebPIDecoder* const idec, VP8IoPutHook put,
+                                   VP8IoSetupHook setup,
+                                   VP8IoTeardownHook teardown, void* user_data);

 // Main decoding object. This is an opaque structure.
 typedef struct VP8Decoder VP8Decoder;
@ -128,17 +127,17 @@ VP8Decoder* VP8New(void);
 // Must be called to make sure 'io' is initialized properly.
 // Returns false in case of version mismatch. Upon such failure, no other
 // decoding function should be called (VP8Decode, VP8GetHeaders, ...)
-static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
+WEBP_NODISCARD static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
 }

 // Decode the VP8 frame header. Returns true if ok.
 // Note: 'io->data' must be pointing to the start of the VP8 frame header.
-int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);

 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
 // Returns false in case of error.
-int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8Decode(VP8Decoder* const dec, VP8Io* const io);

 // Return current status of the decoder:
 VP8StatusCode VP8Status(VP8Decoder* const dec);
--- a/src/dec/vp8i_dec.h
+++ b/src/dec/vp8i_dec.h
@ -21,6 +21,7 @@
 #include "src/utils/random_utils.h"
 #include "src/utils/thread_utils.h"
 #include "src/dsp/dsp.h"
+#include "src/webp/types.h"

 #ifdef __cplusplus
 extern "C" {
@ -31,8 +32,8 @@ extern "C" {

 // version numbers
 #define DEC_MAJ_VERSION 1
-#define DEC_MIN_VERSION 3
-#define DEC_REV_VERSION 1
+#define DEC_MIN_VERSION 5
+#define DEC_REV_VERSION 0

 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
@ -186,6 +187,7 @@ struct VP8Decoder {

  // Main data source
  VP8BitReader br_;
+  int incremental_;  // if true, incremental decoding is expected

  // headers
  VP8FrameHeader   frm_hdr_;
@ -281,7 +283,7 @@ int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);
 void VP8ParseQuant(VP8Decoder* const dec);

 // in frame.c
-int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
@ -289,7 +291,7 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io);
 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
 // Must always be called in pair with VP8EnterCritical().
 // Returns false in case of error.
-int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
 // Return the multi-threading method to use (0=off), depending
 // on options and bitstream size. Only for lossy decoding.
 int VP8GetThreadMethod(const WebPDecoderOptions* const options,
@ -299,11 +301,12 @@ int VP8GetThreadMethod(const WebPDecoderOptions* const options,
 void VP8InitDithering(const WebPDecoderOptions* const options,
                      VP8Decoder* const dec);
 // Process the last decoded row (filtering + output).
-int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
 // Decode one macroblock. Returns false if there is not enough data.
-int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
+WEBP_NODISCARD int VP8DecodeMB(VP8Decoder* const dec,
+                               VP8BitReader* const token_br);

 // in alpha.c
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
--- a/src/dec/vp8l_dec.c
+++ b/src/dec/vp8l_dec.c
@ -12,6 +12,8 @@
 // Authors: Vikas Arora (vikaas.arora@gmail.com)
 //          Jyrki Alakuijala (jyrki@google.com)

+#include <assert.h>
+#include <stddef.h>
 #include <stdlib.h>

 #include "src/dec/alphai_dec.h"
@ -19,10 +21,9 @@
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
-#include "src/dsp/yuv.h"
-#include "src/utils/endian_inl_utils.h"
 #include "src/utils/huffman_utils.h"
 #include "src/utils/utils.h"
+#include "src/webp/format_constants.h"

 #define NUM_ARGB_CACHE_ROWS          16

@ -101,6 +102,14 @@ static const uint16_t kTableSize[12] = {
  FIXED_TABLE_SIZE + 2704
 };

+static int VP8LSetError(VP8LDecoder* const dec, VP8StatusCode error) {
+  // The oldest error reported takes precedence; SUSPENDED can be overwritten.
+  if (dec->status_ == VP8_STATUS_OK || dec->status_ == VP8_STATUS_SUSPENDED) {
+    dec->status_ = error;
+  }
+  return 0;
+}
+
 static int DecodeImageStream(int xsize, int ysize,
                             int is_level0,
                             VP8LDecoder* const dec,
@ -253,11 +262,11 @@ static int ReadHuffmanCodeLengths(
  int symbol;
  int max_symbol;
  int prev_code_len = DEFAULT_CODE_LENGTH;
-  HuffmanCode table[1 << LENGTHS_TABLE_BITS];
+  HuffmanTables tables;

-  if (!VP8LBuildHuffmanTable(table, LENGTHS_TABLE_BITS,
-                             code_length_code_lengths,
-                             NUM_CODE_LENGTH_CODES)) {
+  if (!VP8LHuffmanTablesAllocate(1 << LENGTHS_TABLE_BITS, &tables) ||
+      !VP8LBuildHuffmanTable(&tables, LENGTHS_TABLE_BITS,
+                             code_length_code_lengths, NUM_CODE_LENGTH_CODES)) {
    goto End;
  }

@ -277,7 +286,7 @@ static int ReadHuffmanCodeLengths(
    int code_len;
    if (max_symbol-- == 0) break;
    VP8LFillBitWindow(br);
-    p = &table[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK];
+    p = &tables.curr_segment->start[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK];
    VP8LSetBitPos(br, br->bit_pos_ + p->bits);
    code_len = p->value;
    if (code_len < kCodeLengthLiterals) {
@ -300,14 +309,16 @@ static int ReadHuffmanCodeLengths(
  ok = 1;

 End:
-  if (!ok) dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+  VP8LHuffmanTablesDeallocate(&tables);
+  if (!ok) return VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
  return ok;
 }

 // 'code_lengths' is pre-allocated temporary buffer, used for creating Huffman
 // tree.
 static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
-                           int* const code_lengths, HuffmanCode* const table) {
+                           int* const code_lengths,
+                           HuffmanTables* const table) {
  int ok = 0;
  int size = 0;
  VP8LBitReader* const br = &dec->br_;
@ -331,10 +342,7 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
    int i;
    int code_length_code_lengths[NUM_CODE_LENGTH_CODES] = { 0 };
    const int num_codes = VP8LReadBits(br, 4) + 4;
-    if (num_codes > NUM_CODE_LENGTH_CODES) {
-      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
-      return 0;
-    }
+    assert(num_codes <= NUM_CODE_LENGTH_CODES);

    for (i = 0; i < num_codes; ++i) {
      code_length_code_lengths[kCodeLengthCodeOrder[i]] = VP8LReadBits(br, 3);
@ -349,36 +357,36 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
                                 code_lengths, alphabet_size);
  }
  if (!ok || size == 0) {
-    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
-    return 0;
+    return VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
  }
  return size;
 }

 static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
                            int color_cache_bits, int allow_recursion) {
-  int i, j;
+  int i;
  VP8LBitReader* const br = &dec->br_;
  VP8LMetadata* const hdr = &dec->hdr_;
  uint32_t* huffman_image = NULL;
  HTreeGroup* htree_groups = NULL;
-  HuffmanCode* huffman_tables = NULL;
-  HuffmanCode* huffman_table = NULL;
+  HuffmanTables* huffman_tables = &hdr->huffman_tables_;
  int num_htree_groups = 1;
  int num_htree_groups_max = 1;
-  int max_alphabet_size = 0;
-  int* code_lengths = NULL;
-  const int table_size = kTableSize[color_cache_bits];
  int* mapping = NULL;
  int ok = 0;

+  // Check the table has been 0 initialized (through InitMetadata).
+  assert(huffman_tables->root.start == NULL);
+  assert(huffman_tables->curr_segment == NULL);
+
  if (allow_recursion && VP8LReadBits(br, 1)) {
    // use meta Huffman codes.
-    const int huffman_precision = VP8LReadBits(br, 3) + 2;
+    const int huffman_precision =
+        MIN_HUFFMAN_BITS + VP8LReadBits(br, NUM_HUFFMAN_BITS);
    const int huffman_xsize = VP8LSubSampleSize(xsize, huffman_precision);
    const int huffman_ysize = VP8LSubSampleSize(ysize, huffman_precision);
    const int huffman_pixs = huffman_xsize * huffman_ysize;
-    if (!DecodeImageStream(huffman_xsize, huffman_ysize, 0, dec,
+    if (!DecodeImageStream(huffman_xsize, huffman_ysize, /*is_level0=*/0, dec,
                           &huffman_image)) {
      goto Error;
    }
@ -402,7 +410,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
      // values [0, num_htree_groups)
      mapping = (int*)WebPSafeMalloc(num_htree_groups_max, sizeof(*mapping));
      if (mapping == NULL) {
-        dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+        VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
        goto Error;
      }
      // -1 means a value is unmapped, and therefore unused in the Huffman
@ -421,29 +429,55 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,

  if (br->eos_) goto Error;

-  // Find maximum alphabet size for the htree group.
-  for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
-    int alphabet_size = kAlphabetSize[j];
-    if (j == 0 && color_cache_bits > 0) {
-      alphabet_size += 1 << color_cache_bits;
-    }
-    if (max_alphabet_size < alphabet_size) {
-      max_alphabet_size = alphabet_size;
-    }
+  if (!ReadHuffmanCodesHelper(color_cache_bits, num_htree_groups,
+                              num_htree_groups_max, mapping, dec,
+                              huffman_tables, &htree_groups)) {
+    goto Error;
  }
+  ok = 1;

-  code_lengths = (int*)WebPSafeCalloc((uint64_t)max_alphabet_size,
-                                      sizeof(*code_lengths));
-  huffman_tables = (HuffmanCode*)WebPSafeMalloc(num_htree_groups * table_size,
-                                                sizeof(*huffman_tables));
-  htree_groups = VP8LHtreeGroupsNew(num_htree_groups);
+  // All OK. Finalize pointers.
+  hdr->huffman_image_ = huffman_image;
+  hdr->num_htree_groups_ = num_htree_groups;
+  hdr->htree_groups_ = htree_groups;

-  if (htree_groups == NULL || code_lengths == NULL || huffman_tables == NULL) {
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+ Error:
+  WebPSafeFree(mapping);
+  if (!ok) {
+    WebPSafeFree(huffman_image);
+    VP8LHuffmanTablesDeallocate(huffman_tables);
+    VP8LHtreeGroupsFree(htree_groups);
+  }
+  return ok;
+}
+
+int ReadHuffmanCodesHelper(int color_cache_bits, int num_htree_groups,
+                           int num_htree_groups_max, const int* const mapping,
+                           VP8LDecoder* const dec,
+                           HuffmanTables* const huffman_tables,
+                           HTreeGroup** const htree_groups) {
+  int i, j, ok = 0;
+  const int max_alphabet_size =
+      kAlphabetSize[0] + ((color_cache_bits > 0) ? 1 << color_cache_bits : 0);
+  const int table_size = kTableSize[color_cache_bits];
+  int* code_lengths = NULL;
+
+  if ((mapping == NULL && num_htree_groups != num_htree_groups_max) ||
+      num_htree_groups > num_htree_groups_max) {
+    goto Error;
+  }
+
+  code_lengths =
+      (int*)WebPSafeCalloc((uint64_t)max_alphabet_size, sizeof(*code_lengths));
+  *htree_groups = VP8LHtreeGroupsNew(num_htree_groups);
+
+  if (*htree_groups == NULL || code_lengths == NULL ||
+      !VP8LHuffmanTablesAllocate(num_htree_groups * table_size,
+                                 huffman_tables)) {
+    VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
    goto Error;
  }

-  huffman_table = huffman_tables;
  for (i = 0; i < num_htree_groups_max; ++i) {
    // If the index "i" is unused in the Huffman image, just make sure the
    // coefficients are valid but do not store them.
@ -460,7 +494,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
      }
    } else {
      HTreeGroup* const htree_group =
-          &htree_groups[(mapping == NULL) ? i : mapping[i]];
+          &(*htree_groups)[(mapping == NULL) ? i : mapping[i]];
      HuffmanCode** const htrees = htree_group->htrees;
      int size;
      int total_size = 0;
@ -468,19 +502,20 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
      int max_bits = 0;
      for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
        int alphabet_size = kAlphabetSize[j];
-        htrees[j] = huffman_table;
        if (j == 0 && color_cache_bits > 0) {
          alphabet_size += (1 << color_cache_bits);
        }
-        size = ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_table);
+        size =
+            ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_tables);
+        htrees[j] = huffman_tables->curr_segment->curr_table;
        if (size == 0) {
          goto Error;
        }
        if (is_trivial_literal && kLiteralMap[j] == 1) {
-          is_trivial_literal = (huffman_table->bits == 0);
+          is_trivial_literal = (htrees[j]->bits == 0);
        }
-        total_size += huffman_table->bits;
-        huffman_table += size;
+        total_size += htrees[j]->bits;
+        huffman_tables->curr_segment->curr_table += size;
        if (j <= ALPHA) {
          int local_max_bits = code_lengths[0];
          int k;
@ -511,19 +546,12 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
  }
  ok = 1;

-  // All OK. Finalize pointers.
-  hdr->huffman_image_ = huffman_image;
-  hdr->num_htree_groups_ = num_htree_groups;
-  hdr->htree_groups_ = htree_groups;
-  hdr->huffman_tables_ = huffman_tables;
-
 Error:
  WebPSafeFree(code_lengths);
-  WebPSafeFree(mapping);
  if (!ok) {
-    WebPSafeFree(huffman_image);
-    WebPSafeFree(huffman_tables);
-    VP8LHtreeGroupsFree(htree_groups);
+    VP8LHuffmanTablesDeallocate(huffman_tables);
+    VP8LHtreeGroupsFree(*htree_groups);
+    *htree_groups = NULL;
  }
  return ok;
 }
@ -547,8 +575,7 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
                               scaled_data_size * sizeof(*scaled_data);
  uint8_t* memory = (uint8_t*)WebPSafeMalloc(memory_size, sizeof(*memory));
  if (memory == NULL) {
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-    return 0;
+    return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
  }
  assert(dec->rescaler_memory == NULL);
  dec->rescaler_memory = memory;
@ -598,8 +625,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
  int num_lines_in = 0;
  int num_lines_out = 0;
  while (num_lines_in < mb_h) {
-    uint8_t* const row_in = in + (uint64_t)num_lines_in * in_stride;
-    uint8_t* const row_out = out + (uint64_t)num_lines_out * out_stride;
+    uint8_t* const row_in = in + (ptrdiff_t)num_lines_in * in_stride;
+    uint8_t* const row_out = out + (ptrdiff_t)num_lines_out * out_stride;
    const int lines_left = mb_h - num_lines_in;
    const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
    int lines_imported;
@ -801,7 +828,7 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
      if (WebPIsRGBMode(output->colorspace)) {  // convert to RGBA
        const WebPRGBABuffer* const buf = &output->u.RGBA;
        uint8_t* const rgba =
-            buf->rgba + (int64_t)dec->last_out_row_ * buf->stride;
+            buf->rgba + (ptrdiff_t)dec->last_out_row_ * buf->stride;
        const int num_rows_out =
 #if !defined(WEBP_REDUCE_SIZE)
         io->use_scaling ?
@ -1082,12 +1109,10 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
 End:
  br->eos_ = VP8LIsEndOfStream(br);
  if (!ok || (br->eos_ && pos < end)) {
-    ok = 0;
-    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
-                            : VP8_STATUS_BITSTREAM_ERROR;
-  } else {
-    dec->last_pixel_ = pos;
+    return VP8LSetError(
+        dec, br->eos_ ? VP8_STATUS_SUSPENDED : VP8_STATUS_BITSTREAM_ERROR);
  }
+  dec->last_pixel_ = pos;
  return ok;
 }

@ -1237,9 +1262,20 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
  }

  br->eos_ = VP8LIsEndOfStream(br);
-  if (dec->incremental_ && br->eos_ && src < src_end) {
+  // In incremental decoding:
+  // br->eos_ && src < src_last: if 'br' reached the end of the buffer and
+  // 'src_last' has not been reached yet, there is not enough data. 'dec' has to
+  // be reset until there is more data.
+  // !br->eos_ && src < src_last: this cannot happen as either the buffer is
+  // fully read, or enough has been read to reach 'src_last'.
+  // src >= src_last: 'src_last' is reached, all is fine. 'src' can actually go
+  // beyond 'src_last' in case the image is cropped and an LZ77 goes further.
+  // The buffer might have been enough or there is some left. 'br->eos_' does
+  // not matter.
+  assert(!dec->incremental_ || (br->eos_ && src < src_last) || src >= src_last);
+  if (dec->incremental_ && br->eos_ && src < src_last) {
    RestoreState(dec);
-  } else if (!br->eos_) {
+  } else if ((dec->incremental_ && src >= src_last) || !br->eos_) {
    // Process the remaining rows corresponding to last row-block.
    if (process_func != NULL) {
      process_func(dec, row > last_row ? last_row : row);
@ -1254,8 +1290,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
  return 1;

 Error:
-  dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
-  return 0;
+  return VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
 }

 // -----------------------------------------------------------------------------
@ -1317,12 +1352,13 @@ static int ReadTransform(int* const xsize, int const* ysize,
  switch (type) {
    case PREDICTOR_TRANSFORM:
    case CROSS_COLOR_TRANSFORM:
-      transform->bits_ = VP8LReadBits(br, 3) + 2;
+      transform->bits_ =
+          MIN_TRANSFORM_BITS + VP8LReadBits(br, NUM_TRANSFORM_BITS);
      ok = DecodeImageStream(VP8LSubSampleSize(transform->xsize_,
                                               transform->bits_),
                             VP8LSubSampleSize(transform->ysize_,
                                               transform->bits_),
-                             0, dec, &transform->data_);
+                             /*is_level0=*/0, dec, &transform->data_);
      break;
    case COLOR_INDEXING_TRANSFORM: {
       const int num_colors = VP8LReadBits(br, 8) + 1;
@ -1332,8 +1368,11 @@ static int ReadTransform(int* const xsize, int const* ysize,
                      : 3;
       *xsize = VP8LSubSampleSize(transform->xsize_, bits);
       transform->bits_ = bits;
-       ok = DecodeImageStream(num_colors, 1, 0, dec, &transform->data_);
-       ok = ok && ExpandColorMap(num_colors, transform);
+       ok = DecodeImageStream(num_colors, /*ysize=*/1, /*is_level0=*/0, dec,
+                              &transform->data_);
+       if (ok && !ExpandColorMap(num_colors, transform)) {
+         return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
+       }
      break;
    }
    case SUBTRACT_GREEN_TRANSFORM:
@ -1358,7 +1397,7 @@ static void ClearMetadata(VP8LMetadata* const hdr) {
  assert(hdr != NULL);

  WebPSafeFree(hdr->huffman_image_);
-  WebPSafeFree(hdr->huffman_tables_);
+  VP8LHuffmanTablesDeallocate(&hdr->huffman_tables_);
  VP8LHtreeGroupsFree(hdr->htree_groups_);
  VP8LColorCacheClear(&hdr->color_cache_);
  VP8LColorCacheClear(&hdr->saved_color_cache_);
@ -1379,7 +1418,9 @@ VP8LDecoder* VP8LNew(void) {
  return dec;
 }

-void VP8LClear(VP8LDecoder* const dec) {
+// Resets the decoder to its initial state, reclaiming memory.
+// Preserves the dec->status_ value.
+static void VP8LClear(VP8LDecoder* const dec) {
  int i;
  if (dec == NULL) return;
  ClearMetadata(&dec->hdr_);
@ -1439,7 +1480,7 @@ static int DecodeImageStream(int xsize, int ysize,
    color_cache_bits = VP8LReadBits(br, 4);
    ok = (color_cache_bits >= 1 && color_cache_bits <= MAX_CACHE_BITS);
    if (!ok) {
-      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+      VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
      goto End;
    }
  }
@ -1448,7 +1489,7 @@ static int DecodeImageStream(int xsize, int ysize,
  ok = ok && ReadHuffmanCodes(dec, transform_xsize, transform_ysize,
                              color_cache_bits, is_level0);
  if (!ok) {
-    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
    goto End;
  }

@ -1456,8 +1497,7 @@ static int DecodeImageStream(int xsize, int ysize,
  if (color_cache_bits > 0) {
    hdr->color_cache_size_ = 1 << color_cache_bits;
    if (!VP8LColorCacheInit(&hdr->color_cache_, color_cache_bits)) {
-      dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-      ok = 0;
+      ok = VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
      goto End;
    }
  } else {
@ -1474,8 +1514,7 @@ static int DecodeImageStream(int xsize, int ysize,
    const uint64_t total_size = (uint64_t)transform_xsize * transform_ysize;
    data = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*data));
    if (data == NULL) {
-      dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-      ok = 0;
+      ok = VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
      goto End;
    }
  }
@ -1520,8 +1559,7 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
  if (dec->pixels_ == NULL) {
    dec->argb_cache_ = NULL;    // for soundness
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-    return 0;
+    return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
  }
  dec->argb_cache_ = dec->pixels_ + num_pixels + cache_top_pixels;
  return 1;
@ -1532,8 +1570,7 @@ static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
  dec->argb_cache_ = NULL;    // for soundness
  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t));
  if (dec->pixels_ == NULL) {
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-    return 0;
+    return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
  }
  return 1;
 }
@ -1588,7 +1625,8 @@ int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec,
  dec->status_ = VP8_STATUS_OK;
  VP8LInitBitReader(&dec->br_, data, data_size);

-  if (!DecodeImageStream(alph_dec->width_, alph_dec->height_, 1, dec, NULL)) {
+  if (!DecodeImageStream(alph_dec->width_, alph_dec->height_, /*is_level0=*/1,
+                         dec, /*decoded_data=*/NULL)) {
    goto Err;
  }

@ -1643,22 +1681,24 @@ int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io) {

  if (dec == NULL) return 0;
  if (io == NULL) {
-    dec->status_ = VP8_STATUS_INVALID_PARAM;
-    return 0;
+    return VP8LSetError(dec, VP8_STATUS_INVALID_PARAM);
  }

  dec->io_ = io;
  dec->status_ = VP8_STATUS_OK;
  VP8LInitBitReader(&dec->br_, io->data, io->data_size);
  if (!ReadImageInfo(&dec->br_, &width, &height, &has_alpha)) {
-    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
    goto Error;
  }
  dec->state_ = READ_DIM;
  io->width = width;
  io->height = height;

-  if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Error;
+  if (!DecodeImageStream(width, height, /*is_level0=*/1, dec,
+                         /*decoded_data=*/NULL)) {
+    goto Error;
+  }
  return 1;

 Error:
@ -1673,7 +1713,7 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {

  if (dec == NULL) return 0;

-  assert(dec->hdr_.huffman_tables_ != NULL);
+  assert(dec->hdr_.huffman_tables_.root.start != NULL);
  assert(dec->hdr_.htree_groups_ != NULL);
  assert(dec->hdr_.num_htree_groups_ > 0);

@ -1688,7 +1728,7 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
    assert(dec->output_ != NULL);

    if (!WebPIoInitFromOptions(params->options, io, MODE_BGRA)) {
-      dec->status_ = VP8_STATUS_INVALID_PARAM;
+      VP8LSetError(dec, VP8_STATUS_INVALID_PARAM);
      goto Err;
    }

@ -1698,7 +1738,7 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
    if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
 #else
    if (io->use_scaling) {
-      dec->status_ = VP8_STATUS_INVALID_PARAM;
+      VP8LSetError(dec, VP8_STATUS_INVALID_PARAM);
      goto Err;
    }
 #endif
@ -1716,7 +1756,7 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
          dec->hdr_.saved_color_cache_.colors_ == NULL) {
        if (!VP8LColorCacheInit(&dec->hdr_.saved_color_cache_,
                                dec->hdr_.color_cache_.hash_bits_)) {
-          dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+          VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
          goto Err;
        }
      }
--- a/src/dec/vp8li_dec.h
+++ b/src/dec/vp8li_dec.h
@ -20,6 +20,7 @@
 #include "src/utils/bit_reader_utils.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/huffman_utils.h"
+#include "src/webp/types.h"

 #ifdef __cplusplus
 extern "C" {
@ -51,7 +52,7 @@ typedef struct {
  uint32_t*       huffman_image_;
  int             num_htree_groups_;
  HTreeGroup*     htree_groups_;
-  HuffmanCode*    huffman_tables_;
+  HuffmanTables   huffman_tables_;
 } VP8LMetadata;

 typedef struct VP8LDecoder VP8LDecoder;
@ -99,33 +100,42 @@ struct ALPHDecoder;  // Defined in dec/alphai.h.

 // Decodes image header for alpha data stored using lossless compression.
 // Returns false in case of error.
-int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
-                          const uint8_t* const data, size_t data_size);
+WEBP_NODISCARD int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
+                                         const uint8_t* const data,
+                                         size_t data_size);

 // Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
 // already decoded in previous call(s), it will resume decoding from where it
 // was paused.
 // Returns false in case of bitstream error.
-int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
-                               int last_row);
+WEBP_NODISCARD int VP8LDecodeAlphaImageStream(
+    struct ALPHDecoder* const alph_dec, int last_row);

 // Allocates and initialize a new lossless decoder instance.
-VP8LDecoder* VP8LNew(void);
+WEBP_NODISCARD VP8LDecoder* VP8LNew(void);

 // Decodes the image header. Returns false in case of error.
-int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);

 // Decodes an image. It's required to decode the lossless header before calling
 // this function. Returns false in case of error, with updated dec->status_.
-int VP8LDecodeImage(VP8LDecoder* const dec);
-
-// Resets the decoder in its initial state, reclaiming memory.
-// Preserves the dec->status_ value.
-void VP8LClear(VP8LDecoder* const dec);
+WEBP_NODISCARD int VP8LDecodeImage(VP8LDecoder* const dec);

 // Clears and deallocate a lossless decoder instance.
 void VP8LDelete(VP8LDecoder* const dec);

+// Helper function for reading the different Huffman codes and storing them in
+// 'huffman_tables' and 'htree_groups'.
+// If 'mapping' is NULL, 'num_htree_groups_max' must equal 'num_htree_groups'.
+// If it is not NULL, it maps 'num_htree_groups_max' indices to the
+// 'num_htree_groups' groups. If 'num_htree_groups_max' > 'num_htree_groups',
+// some of those indices map to -1. This is used for non-balanced codes to
+// limit memory usage.
+WEBP_NODISCARD int ReadHuffmanCodesHelper(
+    int color_cache_bits, int num_htree_groups, int num_htree_groups_max,
+    const int* const mapping, VP8LDecoder* const dec,
+    HuffmanTables* const huffman_tables, HTreeGroup** const htree_groups);
+
 //------------------------------------------------------------------------------

 #ifdef __cplusplus
--- a/src/dec/webp_dec.c
+++ b/src/dec/webp_dec.c
@ -13,11 +13,16 @@

 #include <stdlib.h>

+#include "src/dec/common_dec.h"
+#include "src/dec/vp8_dec.h"
 #include "src/dec/vp8i_dec.h"
 #include "src/dec/vp8li_dec.h"
 #include "src/dec/webpi_dec.h"
+#include "src/utils/rescaler_utils.h"
 #include "src/utils/utils.h"
+#include "src/webp/decode.h"
 #include "src/webp/mux_types.h"  // ALPHA_FLAG
+#include "src/webp/types.h"

 //------------------------------------------------------------------------------
 // RIFF layout is:
@ -444,8 +449,9 @@ void WebPResetDecParams(WebPDecParams* const params) {
 // "Into" decoding variants

 // Main flow
-static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
-                                WebPDecParams* const params) {
+WEBP_NODISCARD static VP8StatusCode DecodeInto(const uint8_t* const data,
+                                               size_t data_size,
+                                               WebPDecParams* const params) {
  VP8StatusCode status;
  VP8Io io;
  WebPHeaderStructure headers;
@ -459,7 +465,9 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
  }

  assert(params != NULL);
-  VP8InitIo(&io);
+  if (!VP8InitIo(&io)) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
  io.data = headers.data + headers.offset;
  io.data_size = headers.data_size - headers.offset;
  WebPInitCustomIo(params, &io);  // Plug the I/O functions.
@ -523,17 +531,16 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
 }

 // Helpers
-static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
-                                     const uint8_t* const data,
-                                     size_t data_size,
-                                     uint8_t* const rgba,
-                                     int stride, size_t size) {
+WEBP_NODISCARD static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
+                                                    const uint8_t* const data,
+                                                    size_t data_size,
+                                                    uint8_t* const rgba,
+                                                    int stride, size_t size) {
  WebPDecParams params;
  WebPDecBuffer buf;
-  if (rgba == NULL) {
+  if (rgba == NULL || !WebPInitDecBuffer(&buf)) {
    return NULL;
  }
-  WebPInitDecBuffer(&buf);
  WebPResetDecParams(&params);
  params.output = &buf;
  buf.colorspace    = colorspace;
@ -578,8 +585,7 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, size_t data_size,
                           uint8_t* v, size_t v_size, int v_stride) {
  WebPDecParams params;
  WebPDecBuffer output;
-  if (luma == NULL) return NULL;
-  WebPInitDecBuffer(&output);
+  if (luma == NULL || !WebPInitDecBuffer(&output)) return NULL;
  WebPResetDecParams(&params);
  params.output = &output;
  output.colorspace      = MODE_YUV;
@ -601,13 +607,17 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, size_t data_size,

 //------------------------------------------------------------------------------

-static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* const data,
-                       size_t data_size, int* const width, int* const height,
-                       WebPDecBuffer* const keep_info) {
+WEBP_NODISCARD static uint8_t* Decode(WEBP_CSP_MODE mode,
+                                      const uint8_t* const data,
+                                      size_t data_size, int* const width,
+                                      int* const height,
+                                      WebPDecBuffer* const keep_info) {
  WebPDecParams params;
  WebPDecBuffer output;

-  WebPInitDecBuffer(&output);
+  if (!WebPInitDecBuffer(&output)) {
+    return NULL;
+  }
  WebPResetDecParams(&params);
  params.output = &output;
  output.colorspace = mode;
@ -733,7 +743,64 @@ int WebPInitDecoderConfigInternal(WebPDecoderConfig* config,
  }
  memset(config, 0, sizeof(*config));
  DefaultFeatures(&config->input);
-  WebPInitDecBuffer(&config->output);
+  if (!WebPInitDecBuffer(&config->output)) {
+    return 0;
+  }
+  return 1;
+}
+
+static int WebPCheckCropDimensionsBasic(int x, int y, int w, int h) {
+  return !(x < 0 || y < 0 || w <= 0 || h <= 0);
+}
+
+int WebPValidateDecoderConfig(const WebPDecoderConfig* config) {
+  const WebPDecoderOptions* options;
+  if (config == NULL) return 0;
+  if (!IsValidColorspace(config->output.colorspace)) {
+    return 0;
+  }
+
+  options = &config->options;
+  // bypass_filtering, no_fancy_upsampling, use_cropping, use_scaling,
+  // use_threads, flip can be any integer and are interpreted as boolean.
+
+  // Check for cropping.
+  if (options->use_cropping && !WebPCheckCropDimensionsBasic(
+                                   options->crop_left, options->crop_top,
+                                   options->crop_width, options->crop_height)) {
+    return 0;
+  }
+  // Check for scaling.
+  if (options->use_scaling &&
+      (options->scaled_width < 0 || options->scaled_height < 0 ||
+       (options->scaled_width == 0 && options->scaled_height == 0))) {
+    return 0;
+  }
+
+  // In case the WebPBitstreamFeatures has been filled in, check further.
+  if (config->input.width > 0 || config->input.height > 0) {
+    int scaled_width = options->scaled_width;
+    int scaled_height = options->scaled_height;
+    if (options->use_cropping &&
+        !WebPCheckCropDimensions(config->input.width, config->input.height,
+                                 options->crop_left, options->crop_top,
+                                 options->crop_width, options->crop_height)) {
+      return 0;
+    }
+    if (options->use_scaling && !WebPRescalerGetScaledDimensions(
+                                    config->input.width, config->input.height,
+                                    &scaled_width, &scaled_height)) {
+      return 0;
+    }
+  }
+
+  // Check for dithering.
+  if (options->dithering_strength < 0 || options->dithering_strength > 100 ||
+      options->alpha_dithering_strength < 0 ||
+      options->alpha_dithering_strength > 100) {
+    return 0;
+  }
+
  return 1;
 }

@ -772,7 +839,9 @@ VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
  if (WebPAvoidSlowMemory(params.output, &config->input)) {
    // decoding to slow memory: use a temporary in-mem buffer to decode into.
    WebPDecBuffer in_mem_buffer;
-    WebPInitDecBuffer(&in_mem_buffer);
+    if (!WebPInitDecBuffer(&in_mem_buffer)) {
+      return VP8_STATUS_INVALID_PARAM;
+    }
    in_mem_buffer.colorspace = config->output.colorspace;
    in_mem_buffer.width = config->input.width;
    in_mem_buffer.height = config->input.height;
@ -794,8 +863,8 @@ VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,

 int WebPCheckCropDimensions(int image_width, int image_height,
                            int x, int y, int w, int h) {
-  return !(x < 0 || y < 0 || w <= 0 || h <= 0 ||
-           x >= image_width || w > image_width || w > image_width - x ||
+  return WebPCheckCropDimensionsBasic(x, y, w, h) &&
+         !(x >= image_width || w > image_width || w > image_width - x ||
           y >= image_height || h > image_height || h > image_height - y);
 }

--- a/src/dec/webpi_dec.h
+++ b/src/dec/webpi_dec.h
@ -20,6 +20,7 @@ extern "C" {

 #include "src/utils/rescaler_utils.h"
 #include "src/dec/vp8_dec.h"
+#include "src/webp/decode.h"

 //------------------------------------------------------------------------------
 // WebPDecParams: Decoding output parameters. Transient internal object.
@ -87,8 +88,9 @@ void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);

 // Setup crop_xxx fields, mb_w and mb_h in io. 'src_colorspace' refers
 // to the *compressed* format, not the output one.
-int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
-                          VP8Io* const io, WEBP_CSP_MODE src_colorspace);
+WEBP_NODISCARD int WebPIoInitFromOptions(
+    const WebPDecoderOptions* const options, VP8Io* const io,
+    WEBP_CSP_MODE src_colorspace);

 //------------------------------------------------------------------------------
 // Internal functions regarding WebPDecBuffer memory (in buffer.c).
--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -13,6 +13,6 @@ noinst_HEADERS =
 noinst_HEADERS += ../webp/format_constants.h

 libwebpdemux_la_LIBADD = ../libwebp.la
-libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:13:0
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:16:0
 libwebpdemuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/anim_decode.c
+++ b/src/demux/anim_decode.c
@ -20,6 +20,7 @@
 #include "src/utils/utils.h"
 #include "src/webp/decode.h"
 #include "src/webp/demux.h"
+#include "src/webp/types.h"

 #define NUM_CHANNELS 4

@ -68,8 +69,9 @@ int WebPAnimDecoderOptionsInitInternal(WebPAnimDecoderOptions* dec_options,
  return 1;
 }

-static int ApplyDecoderOptions(const WebPAnimDecoderOptions* const dec_options,
-                               WebPAnimDecoder* const dec) {
+WEBP_NODISCARD static int ApplyDecoderOptions(
+    const WebPAnimDecoderOptions* const dec_options,
+    WebPAnimDecoder* const dec) {
  WEBP_CSP_MODE mode;
  WebPDecoderConfig* config = &dec->config_;
  assert(dec_options != NULL);
@ -82,7 +84,9 @@ static int ApplyDecoderOptions(const WebPAnimDecoderOptions* const dec_options,
  dec->blend_func_ = (mode == MODE_RGBA || mode == MODE_BGRA)
                         ? &BlendPixelRowNonPremult
                         : &BlendPixelRowPremult;
-  WebPInitDecoderConfig(config);
+  if (!WebPInitDecoderConfig(config)) {
+    return 0;
+  }
  config->output.colorspace = mode;
  config->output.is_external_memory = 1;
  config->options.use_threads = dec_options->use_threads;
@ -157,8 +161,8 @@ static int IsFullFrame(int width, int height, int canvas_width,
 }

 // Clear the canvas to transparent.
-static int ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
-                          uint32_t canvas_height) {
+WEBP_NODISCARD static int ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
+                                         uint32_t canvas_height) {
  const uint64_t size =
      (uint64_t)canvas_width * canvas_height * NUM_CHANNELS * sizeof(*buf);
  if (!CheckSizeOverflow(size)) return 0;
@ -179,8 +183,8 @@ static void ZeroFillFrameRect(uint8_t* buf, int buf_stride, int x_offset,
 }

 // Copy width * height pixels from 'src' to 'dst'.
-static int CopyCanvas(const uint8_t* src, uint8_t* dst,
-                      uint32_t width, uint32_t height) {
+WEBP_NODISCARD static int CopyCanvas(const uint8_t* src, uint8_t* dst,
+                                     uint32_t width, uint32_t height) {
  const uint64_t size = (uint64_t)width * height * NUM_CHANNELS;
  if (!CheckSizeOverflow(size)) return 0;
  assert(src != NULL && dst != NULL);
@ -424,7 +428,9 @@ int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
  WebPDemuxReleaseIterator(&dec->prev_iter_);
  dec->prev_iter_ = iter;
  dec->prev_frame_was_keyframe_ = is_key_frame;
-  CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height);
+  if (!CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height)) {
+    goto Error;
+  }
  if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
    ZeroFillFrameRect(dec->prev_frame_disposed_, width * NUM_CHANNELS,
                      dec->prev_iter_.x_offset, dec->prev_iter_.y_offset,
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@ -24,8 +24,8 @@
 #include "src/webp/format_constants.h"

 #define DMUX_MAJ_VERSION 1
-#define DMUX_MIN_VERSION 3
-#define DMUX_REV_VERSION 1
+#define DMUX_MIN_VERSION 5
+#define DMUX_REV_VERSION 0

 typedef struct {
  size_t start_;        // start location of the data
--- a/src/demux/libwebpdemux.rc
+++ b/src/demux/libwebpdemux.rc
@ -6,8 +6,8 @@
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 1,0,3,1
- PRODUCTVERSION 1,0,3,1
+ FILEVERSION 1,0,5,0
+ PRODUCTVERSION 1,0,5,0
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@ -24,12 +24,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "Google, Inc."
            VALUE "FileDescription", "libwebpdemux DLL"
-            VALUE "FileVersion", "1.3.1"
+            VALUE "FileVersion", "1.5.0"
            VALUE "InternalName", "libwebpdemux.dll"
-            VALUE "LegalCopyright", "Copyright (C) 2023"
+            VALUE "LegalCopyright", "Copyright (C) 2024"
            VALUE "OriginalFilename", "libwebpdemux.dll"
            VALUE "ProductName", "WebP Image Demuxer"
-            VALUE "ProductVersion", "1.3.1"
+            VALUE "ProductVersion", "1.5.0"
        END
    END
    BLOCK "VarFileInfo"
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -5,6 +5,8 @@ noinst_LTLIBRARIES += libwebpdsp_sse2.la
 noinst_LTLIBRARIES += libwebpdspdecode_sse2.la
 noinst_LTLIBRARIES += libwebpdsp_sse41.la
 noinst_LTLIBRARIES += libwebpdspdecode_sse41.la
+noinst_LTLIBRARIES += libwebpdsp_avx2.la
+noinst_LTLIBRARIES += libwebpdspdecode_avx2.la
 noinst_LTLIBRARIES += libwebpdsp_neon.la
 noinst_LTLIBRARIES += libwebpdspdecode_neon.la
 noinst_LTLIBRARIES += libwebpdsp_msa.la
@ -44,6 +46,11 @@ ENC_SOURCES += lossless_enc.c
 ENC_SOURCES += quant.h
 ENC_SOURCES += ssim.c

+libwebpdspdecode_avx2_la_SOURCES =
+libwebpdspdecode_avx2_la_SOURCES += lossless_avx2.c
+libwebpdspdecode_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdspdecode_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)
+
 libwebpdspdecode_sse41_la_SOURCES =
 libwebpdspdecode_sse41_la_SOURCES += alpha_processing_sse41.c
 libwebpdspdecode_sse41_la_SOURCES += dec_sse41.c
@ -123,6 +130,12 @@ libwebpdsp_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)
 libwebpdsp_sse41_la_LIBADD = libwebpdspdecode_sse41.la

+libwebpdsp_avx2_la_SOURCES =
+libwebpdsp_avx2_la_SOURCES += lossless_enc_avx2.c
+libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)
+libwebpdsp_avx2_la_LIBADD = libwebpdspdecode_avx2.la
+
 libwebpdsp_neon_la_SOURCES =
 libwebpdsp_neon_la_SOURCES += cost_neon.c
 libwebpdsp_neon_la_SOURCES += enc_neon.c
@ -167,6 +180,7 @@ libwebpdsp_la_LDFLAGS = -lm
 libwebpdsp_la_LIBADD =
 libwebpdsp_la_LIBADD += libwebpdsp_sse2.la
 libwebpdsp_la_LIBADD += libwebpdsp_sse41.la
+libwebpdsp_la_LIBADD += libwebpdsp_avx2.la
 libwebpdsp_la_LIBADD += libwebpdsp_neon.la
 libwebpdsp_la_LIBADD += libwebpdsp_msa.la
 libwebpdsp_la_LIBADD += libwebpdsp_mips32.la
@ -180,6 +194,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdspdecode_la_LIBADD =
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse2.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse41.la
+  libwebpdspdecode_la_LIBADD += libwebpdspdecode_avx2.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_neon.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_msa.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_mips32.la
--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@ -144,6 +144,46 @@ static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
  return (alpha_and == 0xff);
 }

+static void ExtractGreen_SSE2(const uint32_t* WEBP_RESTRICT argb,
+                              uint8_t* WEBP_RESTRICT alpha, int size) {
+  int i;  // alpha[i] receives bits 8..15 of argb[i] for every i in [0, size)
+  const __m128i mask = _mm_set1_epi32(0xff);  // low byte of each 32-bit lane
+  const __m128i* src = (const __m128i*)argb;  // only read via unaligned loads
+
+  for (i = 0; i + 16 <= size; i += 16, src += 4) {  // 16 pixels per iteration
+    const __m128i a0 = _mm_loadu_si128(src + 0);
+    const __m128i a1 = _mm_loadu_si128(src + 1);
+    const __m128i a2 = _mm_loadu_si128(src + 2);
+    const __m128i a3 = _mm_loadu_si128(src + 3);
+    const __m128i b0 = _mm_srli_epi32(a0, 8);  // move bits 8..15 down to 0..7
+    const __m128i b1 = _mm_srli_epi32(a1, 8);
+    const __m128i b2 = _mm_srli_epi32(a2, 8);
+    const __m128i b3 = _mm_srli_epi32(a3, 8);
+    const __m128i c0 = _mm_and_si128(b0, mask);  // clear the upper 24 bits
+    const __m128i c1 = _mm_and_si128(b1, mask);
+    const __m128i c2 = _mm_and_si128(b2, mask);
+    const __m128i c3 = _mm_and_si128(b3, mask);
+    const __m128i d0 = _mm_packs_epi32(c0, c1);  // 32b -> 16b; values <= 255
+    const __m128i d1 = _mm_packs_epi32(c2, c3);
+    const __m128i e = _mm_packus_epi16(d0, d1);  // 16b -> 8b; never saturates
+    // store the 16 extracted bytes
+    _mm_storeu_si128((__m128i*)&alpha[i], e);
+  }
+  if (i + 8 <= size) {  // one half-width step when 8..15 pixels remain
+    const __m128i a0 = _mm_loadu_si128(src + 0);
+    const __m128i a1 = _mm_loadu_si128(src + 1);
+    const __m128i b0 = _mm_srli_epi32(a0, 8);
+    const __m128i b1 = _mm_srli_epi32(a1, 8);
+    const __m128i c0 = _mm_and_si128(b0, mask);
+    const __m128i c1 = _mm_and_si128(b1, mask);
+    const __m128i d = _mm_packs_epi32(c0, c1);
+    const __m128i e = _mm_packus_epi16(d, d);  // same 8 bytes in both halves
+    _mm_storel_epi64((__m128i*)&alpha[i], e);  // writes the low 8 bytes only
+    i += 8;
+  }
+  for (; i < size; ++i) alpha[i] = argb[i] >> 8;  // scalar tail, < 8 pixels
+}
+
 //------------------------------------------------------------------------------
 // Non-dither premultiplied modes

@ -354,6 +394,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
  WebPDispatchAlpha = DispatchAlpha_SSE2;
  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
  WebPExtractAlpha = ExtractAlpha_SSE2;
+  WebPExtractGreen = ExtractGreen_SSE2;

  WebPHasAlpha8b = HasAlpha8b_SSE2;
  WebPHasAlpha32b = HasAlpha32b_SSE2;
--- a/src/dsp/cost.c
+++ b/src/dsp/cost.c
@ -354,8 +354,8 @@ static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
  return cost;
 }

-static void SetResidualCoeffs_C(const int16_t* const coeffs,
-                                VP8Residual* const res) {
+static void SetResidualCoeffs_C(const int16_t* WEBP_RESTRICT const coeffs,
+                                VP8Residual* WEBP_RESTRICT const res) {
  int n;
  res->last = -1;
  assert(res->first == 0 || coeffs[0] == 0);
--- a/src/dsp/cost_mips32.c
+++ b/src/dsp/cost_mips32.c
@ -96,8 +96,8 @@ static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
  return cost;
 }

-static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
-                                     VP8Residual* const res) {
+static void SetResidualCoeffs_MIPS32(const int16_t* WEBP_RESTRICT const coeffs,
+                                     VP8Residual* WEBP_RESTRICT const res) {
  const int16_t* p_coeffs = (int16_t*)coeffs;
  int temp0, temp1, temp2, n, n1;
  assert(res->first == 0 || coeffs[0] == 0);
--- a/src/dsp/cost_neon.c
+++ b/src/dsp/cost_neon.c
@ -19,8 +19,8 @@
 static const uint8_t position[16] = { 1, 2,  3,  4,  5,  6,  7,  8,
                                      9, 10, 11, 12, 13, 14, 15, 16 };

-static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
-                                   VP8Residual* const res) {
+static void SetResidualCoeffs_NEON(const int16_t* WEBP_RESTRICT const coeffs,
+                                   VP8Residual* WEBP_RESTRICT const res) {
  const int16x8_t minus_one = vdupq_n_s16(-1);
  const int16x8_t coeffs_0 = vld1q_s16(coeffs);
  const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);
--- a/src/dsp/cost_sse2.c
+++ b/src/dsp/cost_sse2.c
@ -22,8 +22,8 @@

 //------------------------------------------------------------------------------

-static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
-                                   VP8Residual* const res) {
+static void SetResidualCoeffs_SSE2(const int16_t* WEBP_RESTRICT const coeffs,
+                                   VP8Residual* WEBP_RESTRICT const res) {
  const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Use SSE2 to compare 16 values with a single instruction.
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -36,18 +36,6 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type), "c"(0));
 }
-#elif defined(__x86_64__) && \
-      (defined(__code_model_medium__) || defined(__code_model_large__)) && \
-      defined(__PIC__)
-static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "xchg{q}\t{%%rbx}, %q1\n"
-    "cpuid\n"
-    "xchg{q}\t{%%rbx}, %q1\n"
-    : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]),
-      "=d"(cpu_info[3])
-    : "a"(info_type), "c"(0));
-}
 #elif defined(__i386__) || defined(__x86_64__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
--- a/src/dsp/cpu.h
+++ b/src/dsp/cpu.h
@ -56,6 +56,11 @@
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
 #endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_AVX2  // Visual C++ AVX2 targets
+#endif
 #endif

 // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
@ -80,6 +85,16 @@
 #define WEBP_HAVE_SSE41
 #endif

+#if (defined(__AVX2__) || defined(WEBP_MSC_AVX2)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_AVX2))
+#define WEBP_USE_AVX2  // AVX2 target; HAVE_CONFIG_H builds must also opt in
+#endif  // (__AVX2__ || WEBP_MSC_AVX2) && (!HAVE_CONFIG_H || WEBP_HAVE_AVX2)
+
+#if defined(WEBP_USE_AVX2) && !defined(WEBP_HAVE_AVX2)
+#define WEBP_HAVE_AVX2  // USE implies HAVE when not already defined
+#endif  // WEBP_USE_AVX2 && !WEBP_HAVE_AVX2
+
+#undef WEBP_MSC_AVX2
 #undef WEBP_MSC_SSE41
 #undef WEBP_MSC_SSE2

--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -37,19 +37,19 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
  STORE(3, y, DC - (d));            \
 } while (0)

-#define MUL1(a) ((((a) * 20091) >> 16) + (a))
-#define MUL2(a) (((a) * 35468) >> 16)
-
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformOne_C(const int16_t* in, uint8_t* dst) {
+static void TransformOne_C(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
  for (i = 0; i < 4; ++i) {    // vertical pass
    const int a = in[0] + in[8];    // [-4096, 4094]
    const int b = in[0] - in[8];    // [-4095, 4095]
-    const int c = MUL2(in[4]) - MUL1(in[12]);   // [-3783, 3783]
-    const int d = MUL1(in[4]) + MUL2(in[12]);   // [-3785, 3781]
+    const int c = WEBP_TRANSFORM_AC3_MUL2(in[4]) -
+                  WEBP_TRANSFORM_AC3_MUL1(in[12]);  // [-3783, 3783]
+    const int d = WEBP_TRANSFORM_AC3_MUL1(in[4]) +
+                  WEBP_TRANSFORM_AC3_MUL2(in[12]);  // [-3785, 3781]
    tmp[0] = a + d;   // [-7881, 7875]
    tmp[1] = b + c;   // [-7878, 7878]
    tmp[2] = b - c;   // [-7878, 7878]
@ -69,8 +69,10 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
    const int dc = tmp[0] + 4;
    const int a =  dc +  tmp[8];
    const int b =  dc -  tmp[8];
-    const int c = MUL2(tmp[4]) - MUL1(tmp[12]);
-    const int d = MUL1(tmp[4]) + MUL2(tmp[12]);
+    const int c =
+        WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]);
+    const int d =
+        WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]);
    STORE(0, 0, a + d);
    STORE(1, 0, b + c);
    STORE(2, 0, b - c);
@ -81,22 +83,22 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
 }

 // Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_C(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst) {
  const int a = in[0] + 4;
-  const int c4 = MUL2(in[4]);
-  const int d4 = MUL1(in[4]);
-  const int c1 = MUL2(in[1]);
-  const int d1 = MUL1(in[1]);
+  const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
+  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
+  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
  STORE2(0, a + d4, d1, c1);
  STORE2(1, a + c4, d1, c1);
  STORE2(2, a - c4, d1, c1);
  STORE2(3, a - d4, d1, c1);
 }
-#undef MUL1
-#undef MUL2
 #undef STORE2

-static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo_C(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne_C(in, dst);
  if (do_two) {
    TransformOne_C(in + 16, dst + 4);
@ -104,13 +106,15 @@ static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE

-static void TransformUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformUV_C(const int16_t* WEBP_RESTRICT in,
+                          uint8_t* WEBP_RESTRICT dst) {
  VP8Transform(in + 0 * 16, dst, 1);
  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }

 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformDC_C(const int16_t* in, uint8_t* dst) {
+static void TransformDC_C(const int16_t* WEBP_RESTRICT in,
+                          uint8_t* WEBP_RESTRICT dst) {
  const int DC = in[0] + 4;
  int i, j;
  for (j = 0; j < 4; ++j) {
@ -121,7 +125,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE

-static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV_C(const int16_t* WEBP_RESTRICT in,
+                            uint8_t* WEBP_RESTRICT dst) {
  if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
  if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
  if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@ -134,7 +139,8 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
 // Paragraph 14.3

 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformWHT_C(const int16_t* in, int16_t* out) {
+static void TransformWHT_C(const int16_t* WEBP_RESTRICT in,
+                           int16_t* WEBP_RESTRICT out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {
@ -162,7 +168,7 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE

-void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+VP8WHT VP8TransformWHT;

 //------------------------------------------------------------------------------
 // Intra predictions
@ -662,32 +668,32 @@ static void HFilter16i_C(uint8_t* p, int stride,

 #if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                       int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE

 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                       int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 #if !WEBP_NEON_OMIT_C_CODE
-static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                        int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE

 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                        int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
@ -695,8 +701,8 @@ static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,

 //------------------------------------------------------------------------------

-static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
-                               int dst_stride) {
+static void DitherCombine8x8_C(const uint8_t* WEBP_RESTRICT dither,
+                               uint8_t* WEBP_RESTRICT dst, int dst_stride) {
  int i, j;
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) {
@ -731,8 +737,8 @@ VP8SimpleFilterFunc VP8SimpleHFilter16;
 VP8SimpleFilterFunc VP8SimpleVFilter16i;
 VP8SimpleFilterFunc VP8SimpleHFilter16i;

-void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
-                            int dst_stride);
+void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
+                            uint8_t* WEBP_RESTRICT dst, int dst_stride);

 extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8DspInitSSE2(void);
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@ -18,8 +18,8 @@

 #include "src/dsp/mips_macro.h"

-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
+static const int kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int kC2 = WEBP_TRANSFORM_AC3_C2;

 static WEBP_INLINE int abs_mips32(int x) {
  const int sign = x >> 31;
@ -133,26 +133,26 @@ static void HFilter16(uint8_t* p, int stride,
 }

 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }

-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }

-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }

-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
@ -215,11 +215,12 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  }
 }

-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14;
-  int temp15, temp16, temp17, temp18;
+  int temp15, temp16, temp17, temp18, temp19;
  int16_t* p_in = (int16_t*)in;

  // loops unrolled and merged to avoid usage of tmp buffer
@ -233,16 +234,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "addu     %[temp16], %[temp0],  %[temp8]           \n\t"
    "subu     %[temp0],  %[temp0],  %[temp8]           \n\t"
    "mul      %[temp8],  %[temp4],  %[kC2]             \n\t"
-    "mul      %[temp17], %[temp12], %[kC1]             \n\t"
-    "mul      %[temp4],  %[temp4],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp12)
+    MUL_SHIFT_C1_IO(temp4, temp19)
    "mul      %[temp12], %[temp12], %[kC2]             \n\t"
    "lh       %[temp1],  2(%[in])                      \n\t"
    "lh       %[temp5],  10(%[in])                     \n\t"
    "lh       %[temp9],  18(%[in])                     \n\t"
    "lh       %[temp13], 26(%[in])                     \n\t"
    "sra      %[temp8],  %[temp8],  16                 \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
-    "sra      %[temp4],  %[temp4],  16                 \n\t"
    "sra      %[temp12], %[temp12], 16                 \n\t"
    "lh       %[temp2],  4(%[in])                      \n\t"
    "lh       %[temp6],  12(%[in])                     \n\t"
@ -261,49 +260,43 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "addu     %[temp12], %[temp0],  %[temp17]          \n\t"
    "subu     %[temp0],  %[temp0],  %[temp17]          \n\t"
    "mul      %[temp9],  %[temp5],  %[kC2]             \n\t"
-    "mul      %[temp17], %[temp13], %[kC1]             \n\t"
-    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp13)
+    MUL_SHIFT_C1_IO(temp5, temp19)
    "mul      %[temp13], %[temp13], %[kC2]             \n\t"
    "sra      %[temp9],  %[temp9],  16                 \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
    "subu     %[temp17], %[temp9],  %[temp17]          \n\t"
-    "sra      %[temp5],  %[temp5],  16                 \n\t"
    "sra      %[temp13], %[temp13], 16                 \n\t"
    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
    "addu     %[temp13], %[temp1],  %[temp17]          \n\t"
    "subu     %[temp1],  %[temp1],  %[temp17]          \n\t"
-    "mul      %[temp17], %[temp14], %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp14)
    "mul      %[temp14], %[temp14], %[kC2]             \n\t"
    "addu     %[temp9],  %[temp16], %[temp5]           \n\t"
    "subu     %[temp5],  %[temp16], %[temp5]           \n\t"
    "addu     %[temp16], %[temp2],  %[temp10]          \n\t"
    "subu     %[temp2],  %[temp2],  %[temp10]          \n\t"
    "mul      %[temp10], %[temp6],  %[kC2]             \n\t"
-    "mul      %[temp6],  %[temp6],  %[kC1]             \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
+    MUL_SHIFT_C1_IO(temp6, temp19)
    "sra      %[temp14], %[temp14], 16                 \n\t"
    "sra      %[temp10], %[temp10], 16                 \n\t"
-    "sra      %[temp6],  %[temp6],  16                 \n\t"
    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
    "addu     %[temp6],  %[temp6],  %[temp14]          \n\t"
    "addu     %[temp10], %[temp16], %[temp6]           \n\t"
    "subu     %[temp6],  %[temp16], %[temp6]           \n\t"
    "addu     %[temp14], %[temp2],  %[temp17]          \n\t"
    "subu     %[temp2],  %[temp2],  %[temp17]          \n\t"
-    "mul      %[temp17], %[temp15], %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp15)
    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
    "addu     %[temp16], %[temp3],  %[temp11]          \n\t"
    "subu     %[temp3],  %[temp3],  %[temp11]          \n\t"
    "mul      %[temp11], %[temp7],  %[kC2]             \n\t"
-    "mul      %[temp7],  %[temp7],  %[kC1]             \n\t"
+    MUL_SHIFT_C1_IO(temp7, temp19)
    "addiu    %[temp8],  %[temp8],  4                  \n\t"
    "addiu    %[temp12], %[temp12], 4                  \n\t"
    "addiu    %[temp0],  %[temp0],  4                  \n\t"
    "addiu    %[temp4],  %[temp4],  4                  \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
    "sra      %[temp15], %[temp15], 16                 \n\t"
    "sra      %[temp11], %[temp11], 16                 \n\t"
-    "sra      %[temp7],  %[temp7],  16                 \n\t"
    "subu     %[temp17], %[temp11], %[temp17]          \n\t"
    "addu     %[temp7],  %[temp7],  %[temp15]          \n\t"
    "addu     %[temp15], %[temp3],  %[temp17]          \n\t"
@ -313,48 +306,40 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    "addu     %[temp16], %[temp8],  %[temp10]          \n\t"
    "subu     %[temp8],  %[temp8],  %[temp10]          \n\t"
    "mul      %[temp10], %[temp9],  %[kC2]             \n\t"
-    "mul      %[temp17], %[temp11], %[kC1]             \n\t"
-    "mul      %[temp9],  %[temp9],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp11)
+    MUL_SHIFT_C1_IO(temp9, temp19)
    "mul      %[temp11], %[temp11], %[kC2]             \n\t"
    "sra      %[temp10], %[temp10], 16                 \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
-    "sra      %[temp9],  %[temp9],  16                 \n\t"
    "sra      %[temp11], %[temp11], 16                 \n\t"
    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
    "addu     %[temp11], %[temp9],  %[temp11]          \n\t"
    "addu     %[temp10], %[temp12], %[temp14]          \n\t"
    "subu     %[temp12], %[temp12], %[temp14]          \n\t"
    "mul      %[temp14], %[temp13], %[kC2]             \n\t"
-    "mul      %[temp9],  %[temp15], %[kC1]             \n\t"
-    "mul      %[temp13], %[temp13], %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp9, temp15)
+    MUL_SHIFT_C1_IO(temp13, temp19)
    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
    "sra      %[temp14], %[temp14], 16                 \n\t"
-    "sra      %[temp9],  %[temp9],  16                 \n\t"
-    "sra      %[temp13], %[temp13], 16                 \n\t"
    "sra      %[temp15], %[temp15], 16                 \n\t"
    "subu     %[temp9],  %[temp14], %[temp9]           \n\t"
    "addu     %[temp15], %[temp13], %[temp15]          \n\t"
    "addu     %[temp14], %[temp0],  %[temp2]           \n\t"
    "subu     %[temp0],  %[temp0],  %[temp2]           \n\t"
    "mul      %[temp2],  %[temp1],  %[kC2]             \n\t"
-    "mul      %[temp13], %[temp3],  %[kC1]             \n\t"
-    "mul      %[temp1],  %[temp1],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp13, temp3)
+    MUL_SHIFT_C1_IO(temp1, temp19)
    "mul      %[temp3],  %[temp3],  %[kC2]             \n\t"
    "sra      %[temp2],  %[temp2],  16                 \n\t"
-    "sra      %[temp13], %[temp13], 16                 \n\t"
-    "sra      %[temp1],  %[temp1],  16                 \n\t"
    "sra      %[temp3],  %[temp3],  16                 \n\t"
    "subu     %[temp13], %[temp2],  %[temp13]          \n\t"
    "addu     %[temp3],  %[temp1],  %[temp3]           \n\t"
    "addu     %[temp2],  %[temp4],  %[temp6]           \n\t"
    "subu     %[temp4],  %[temp4],  %[temp6]           \n\t"
    "mul      %[temp6],  %[temp5],  %[kC2]             \n\t"
-    "mul      %[temp1],  %[temp7],  %[kC1]             \n\t"
-    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp1, temp7)
+    MUL_SHIFT_C1_IO(temp5, temp19)
    "mul      %[temp7],  %[temp7],  %[kC2]             \n\t"
    "sra      %[temp6],  %[temp6],  16                 \n\t"
-    "sra      %[temp1],  %[temp1],  16                 \n\t"
-    "sra      %[temp5],  %[temp5],  16                 \n\t"
    "sra      %[temp7],  %[temp7],  16                 \n\t"
    "subu     %[temp1],  %[temp6],  %[temp1]           \n\t"
    "addu     %[temp7],  %[temp5],  %[temp7]           \n\t"
@ -542,13 +527,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
-      [temp18]"=&r"(temp18)
+      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19)
    : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
    : "memory", "hi", "lo"
  );
 }

-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@ -18,12 +18,11 @@

 #include "src/dsp/mips_macro.h"

-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
+static const int kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int kC2 = WEBP_TRANSFORM_AC3_C2;

-#define MUL(a, b) (((a) * (b)) >> 16)
-
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC(const int16_t* WEBP_RESTRICT in,
+                        uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;

  __asm__ volatile (
@ -47,12 +46,13 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
  );
 }

-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
  const int a = in[0] + 4;
-  int c4 = MUL(in[4], kC2);
-  const int d4 = MUL(in[4], kC1);
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
+  int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
+  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
+  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

@ -83,7 +83,8 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  );
 }

-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

@ -150,7 +151,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  );
 }

-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
@ -436,14 +438,14 @@ static void HFilter16(uint8_t* p, int stride,
 }

 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }

-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
@ -467,20 +469,18 @@ static void HFilter16i(uint8_t* p, int stride,
  }
 }

-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }

-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }

-#undef MUL
-
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)

--- a/src/dsp/dec_msa.c
+++ b/src/dsp/dec_msa.c
@ -37,10 +37,9 @@
  d1_m = d_tmp1_m + d_tmp2_m;                                    \
  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
 }
-#define MULT1(a) ((((a) * 20091) >> 16) + (a))
-#define MULT2(a) (((a) * 35468) >> 16)

-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
  v8i16 input0, input1;
  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
  v4i32 res0, res1, res2, res3;
@ -67,14 +66,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }

-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
 }

-static void TransformWHT(const int16_t* in, int16_t* out) {
+static void TransformWHT(const int16_t* WEBP_RESTRICT in,
+                         int16_t* WEBP_RESTRICT out) {
  v8i16 input0, input1;
  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
@ -116,18 +117,20 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
  out[240] = __msa_copy_s_h(out1, 7);
 }

-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC(const int16_t* WEBP_RESTRICT in,
+                        uint8_t* WEBP_RESTRICT dst) {
  const int DC = (in[0] + 4) >> 3;
  const v8i16 tmp0 = __msa_fill_h(DC);
  ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
 }

-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
  const int a = in[0] + 4;
-  const int c4 = MULT2(in[4]);
-  const int d4 = MULT1(in[4]);
-  const int in2 = MULT2(in[1]);
-  const int in3 = MULT1(in[1]);
+  const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
+  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
+  const int in2 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int in3 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
  v4i32 tmp0 = { 0 };
  v4i32 out0 = __msa_fill_w(a + d4);
  v4i32 out1 = __msa_fill_w(a + c4);
@ -477,8 +480,8 @@ static void HFilter16i(uint8_t* src_y, int stride,
 }

 // 8-pixels wide variants, for chroma filtering
-static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
-                     int b_limit_in, int limit_in, int thresh_in) {
+static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
+                     int stride, int b_limit_in, int limit_in, int thresh_in) {
  uint8_t* ptmp_src_u = src_u - 4 * stride;
  uint8_t* ptmp_src_v = src_v - 4 * stride;
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
@ -522,8 +525,8 @@ static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
  SD(q2_d, ptmp_src_v);
 }

-static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
-                     int b_limit_in, int limit_in, int thresh_in) {
+static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
+                     int stride, int b_limit_in, int limit_in, int thresh_in) {
  uint8_t* ptmp_src_u = src_u - 4;
  uint8_t* ptmp_src_v = src_v - 4;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@ -558,7 +561,8 @@ static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
  ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
 }

-static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+static void VFilter8i(uint8_t* WEBP_RESTRICT src_u,
+                      uint8_t* WEBP_RESTRICT src_v, int stride,
                      int b_limit_in, int limit_in, int thresh_in) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@ -589,7 +593,8 @@ static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
  SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
 }

-static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+static void HFilter8i(uint8_t* WEBP_RESTRICT src_u,
+                      uint8_t* WEBP_RESTRICT src_v, int stride,
                      int b_limit_in, int limit_in, int thresh_in) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@ -916,8 +916,8 @@ static void HFilter16i_NEON(uint8_t* p, int stride,
 #endif  // !WORK_AROUND_GCC

 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
@ -932,7 +932,8 @@ static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
 }
-static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
@ -949,8 +950,8 @@ static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
 }

 #if !defined(WORK_AROUND_GCC)
-static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
@ -964,7 +965,8 @@ static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
  }
 }

-static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
@ -1000,8 +1002,9 @@ static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
 // same issue with kC1 and vqdmulh that we work around by down shifting kC2

-static const int16_t kC1 = 20091;
-static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
+static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int16_t kC2 =
+    WEBP_TRANSFORM_AC3_C2 / 2;  // half of the full C2 value. See comment above.

 #if defined(WEBP_USE_INTRINSICS)
 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
@ -1040,7 +1043,8 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  Transpose8x2_NEON(E0, E1, rows);
 }

-static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
@ -1050,7 +1054,8 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {

 #else

-static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
  const int kBPS = BPS;
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
@ -1183,14 +1188,16 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {

 #endif    // WEBP_USE_INTRINSICS

-static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne_NEON(in, dst);
  if (do_two) {
    TransformOne_NEON(in + 16, dst + 4);
  }
 }

-static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
+                             uint8_t* WEBP_RESTRICT dst) {
  const int16x8_t DC = vdupq_n_s16(in[0]);
  Add4x4_NEON(DC, DC, dst);
 }
@ -1204,7 +1211,8 @@ static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
 } while (0)

-static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
+static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
+                              int16_t* WEBP_RESTRICT out) {
  int32x4x4_t tmp;

  {
@ -1255,15 +1263,13 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) {

 //------------------------------------------------------------------------------

-#define MUL(a, b) (((a) * (b)) >> 16)
-static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
-  static const int kC1_full = 20091 + (1 << 16);
-  static const int kC2_full = 35468;
+static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
  const int16x4_t A = vld1_dup_s16(in);
-  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
-  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
-  const int c1 = MUL(in[1], kC2_full);
-  const int d1 = MUL(in[1], kC1_full);
+  const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
+  const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
+  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
                      (uint64_t)( c1 & 0xffff) << 16 |
                      (uint64_t)(-c1 & 0xffff) << 32 |
@ -1274,7 +1280,6 @@ static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
  Add4x4_NEON(m0_m1, m2_m3, dst);
 }
-#undef MUL

 //------------------------------------------------------------------------------
 // 4x4
@ -1303,18 +1308,19 @@ static void DC4_NEON(uint8_t* dst) {    // DC
 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
-  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
+  const uint16x8_t d = vsubl_u8(T, TL);  // A[c] - A[-1]
  int y;
  for (y = 0; y < size; y += 4) {
    // left edge
-    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
-    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
-    const int16x8_t r1 = vaddq_s16(L1, d);
-    const int16x8_t r2 = vaddq_s16(L2, d);
-    const int16x8_t r3 = vaddq_s16(L3, d);
+    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
+    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
+    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
+    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
+    // L[r] + A[c] - A[-1]
+    const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));
+    const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));
+    const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));
+    const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));
    // Saturate and store the result.
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
@ -1575,23 +1581,24 @@ static void TM16_NEON(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
  // A[c] - A[-1]
-  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
-  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
+  const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);
+  const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge
-    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
-    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
-    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
-    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
-    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
-    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
-    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
-    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
-    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
+    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
+    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
+    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
+    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
+    // L[r] + A[c] - A[-1]
+    const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));
+    const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));
+    const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));
+    const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));
+    const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));
+    const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));
+    const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));
+    const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -30,7 +30,8 @@
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

-static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform_SSE2(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -196,15 +197,14 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
 }

 #if (USE_TRANSFORM_AC3 == 1)
-#define MUL(a, b) (((a) * (b)) >> 16)
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
-  static const int kC1 = 20091 + (1 << 16);
-  static const int kC2 = 35468;
+
+static void TransformAC3_SSE2(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
  const __m128i A = _mm_set1_epi16(in[0] + 4);
-  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
-  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
+  const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
+  const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
+  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  const __m128i m0 = _mm_adds_epi16(B, d4);
@ -238,7 +238,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
 }
-#undef MUL
+
 #endif   // USE_TRANSFORM_AC3

 //------------------------------------------------------------------------------
@ -259,15 +259,15 @@ static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
  *x = _mm_packs_epi16(lo_1, hi_1);
 }

-#define FLIP_SIGN_BIT2(a, b) {                                                 \
+#define FLIP_SIGN_BIT2(a, b) do {                                              \
  (a) = _mm_xor_si128(a, sign_bit);                                            \
  (b) = _mm_xor_si128(b, sign_bit);                                            \
-}
+} while (0)

-#define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
+#define FLIP_SIGN_BIT4(a, b, c, d) do {                                        \
  FLIP_SIGN_BIT2(a, b);                                                        \
  FLIP_SIGN_BIT2(c, d);                                                        \
-}
+} while (0)

 // input/output is uint8_t
 static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
@ -645,12 +645,12 @@ static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)

-#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
+#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) do {                          \
  (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]);                        \
  (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]);                        \
  (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]);                        \
  (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]);                        \
-}
+} while (0)

 #define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
  const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
@ -658,18 +658,18 @@ static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
  (p) = _mm_unpacklo_epi64(U, V);                                              \
 } while (0)

-#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
+#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) do {                     \
  LOADUV_H_EDGE(e1, u, v, 0 * (stride));                                       \
  LOADUV_H_EDGE(e2, u, v, 1 * (stride));                                       \
  LOADUV_H_EDGE(e3, u, v, 2 * (stride));                                       \
  LOADUV_H_EDGE(e4, u, v, 3 * (stride));                                       \
-}
+} while (0)

-#define STOREUV(p, u, v, stride) {                                             \
+#define STOREUV(p, u, v, stride) do {                                          \
  _mm_storel_epi64((__m128i*)&(u)[(stride)], p);                               \
  (p) = _mm_srli_si128(p, 8);                                                  \
  _mm_storel_epi64((__m128i*)&(v)[(stride)], p);                               \
-}
+} while (0)

 static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
                                         const __m128i* const p0,
@ -794,8 +794,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride,
 }

 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, p2, p1, p0, q0, q1, q2;

@ -819,8 +819,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q2, u, v, 2 * stride);
 }

-static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

@ -839,7 +839,8 @@ static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
  Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
 }

-static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
@ -865,7 +866,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q1, u, v, 1 * stride);
 }

-static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -60,53 +60,66 @@ extern "C" {
 // Transforms
 // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
 //          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
-typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                        int do_two);
-typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
-typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
+typedef void (*VP8Idct)(const uint8_t* WEBP_RESTRICT ref,
+                        const int16_t* WEBP_RESTRICT in,
+                        uint8_t* WEBP_RESTRICT dst, int do_two);
+typedef void (*VP8Fdct)(const uint8_t* WEBP_RESTRICT src,
+                        const uint8_t* WEBP_RESTRICT ref,
+                        int16_t* WEBP_RESTRICT out);
+typedef void (*VP8WHT)(const int16_t* WEBP_RESTRICT in,
+                       int16_t* WEBP_RESTRICT out);
 extern VP8Idct VP8ITransform;
 extern VP8Fdct VP8FTransform;
 extern VP8Fdct VP8FTransform2;   // performs two transforms at a time
 extern VP8WHT VP8FTransformWHT;
 // Predictions
 // *dst is the destination block. *top and *left can be NULL.
-typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
-                              const uint8_t* top);
-typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top);
+typedef void (*VP8IntraPreds)(uint8_t* WEBP_RESTRICT dst,
+                              const uint8_t* WEBP_RESTRICT left,
+                              const uint8_t* WEBP_RESTRICT top);
+typedef void (*VP8Intra4Preds)(uint8_t* WEBP_RESTRICT dst,
+                               const uint8_t* WEBP_RESTRICT top);
 extern VP8Intra4Preds VP8EncPredLuma4;
 extern VP8IntraPreds VP8EncPredLuma16;
 extern VP8IntraPreds VP8EncPredChroma8;

-typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
+typedef int (*VP8Metric)(const uint8_t* WEBP_RESTRICT pix,
+                         const uint8_t* WEBP_RESTRICT ref);
 extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
-typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
-                          const uint16_t* const weights);
+typedef int (*VP8WMetric)(const uint8_t* WEBP_RESTRICT pix,
+                          const uint8_t* WEBP_RESTRICT ref,
+                          const uint16_t* WEBP_RESTRICT const weights);
 // The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major
 // 4 by 4 symmetric matrix.
 extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;

 // Compute the average (DC) of four 4x4 blocks.
 // Each sub-4x4 block #i sum is stored in dc[i].
-typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]);
+typedef void (*VP8MeanMetric)(const uint8_t* WEBP_RESTRICT ref,
+                              uint32_t dc[4]);
 extern VP8MeanMetric VP8Mean16x4;

-typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
+typedef void (*VP8BlockCopy)(const uint8_t* WEBP_RESTRICT src,
+                             uint8_t* WEBP_RESTRICT dst);
 extern VP8BlockCopy VP8Copy4x4;
 extern VP8BlockCopy VP8Copy16x8;
 // Quantization
 struct VP8Matrix;   // forward declaration
-typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
-                                const struct VP8Matrix* const mtx);
+typedef int (*VP8QuantizeBlock)(
+    int16_t in[16], int16_t out[16],
+    const struct VP8Matrix* WEBP_RESTRICT const mtx);
 // Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
-typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
-                                  const struct VP8Matrix* const mtx);
+typedef int (*VP8Quantize2Blocks)(
+    int16_t in[32], int16_t out[32],
+    const struct VP8Matrix* WEBP_RESTRICT const mtx);

 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 extern VP8Quantize2Blocks VP8EncQuantize2Blocks;

 // specific to 2nd transform:
-typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
-                                   const struct VP8Matrix* const mtx);
+typedef int (*VP8QuantizeBlockWHT)(
+    int16_t in[16], int16_t out[16],
+    const struct VP8Matrix* WEBP_RESTRICT const mtx);
 extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;

 extern const int VP8DspScan[16 + 4 + 4];
@ -118,9 +131,10 @@ typedef struct {
  int max_value;
  int last_non_zero;
 } VP8Histogram;
-typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+typedef void (*VP8CHisto)(const uint8_t* WEBP_RESTRICT ref,
+                          const uint8_t* WEBP_RESTRICT pred,
                          int start_block, int end_block,
-                          VP8Histogram* const histo);
+                          VP8Histogram* WEBP_RESTRICT const histo);
 extern VP8CHisto VP8CollectHistogram;
 // General-purpose util function to help VP8CollectHistogram().
 void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
@ -138,8 +152,9 @@ extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
 extern const uint8_t VP8EncBands[16 + 1];

 struct VP8Residual;
-typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
-                                         struct VP8Residual* const res);
+typedef void (*VP8SetResidualCoeffsFunc)(
+    const int16_t* WEBP_RESTRICT const coeffs,
+    struct VP8Residual* WEBP_RESTRICT const res);
 extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;

 // Cost calculation function.
@ -193,9 +208,11 @@ void VP8SSIMDspInit(void);
 //------------------------------------------------------------------------------
 // Decoding

-typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
+typedef void (*VP8DecIdct)(const int16_t* WEBP_RESTRICT coeffs,
+                           uint8_t* WEBP_RESTRICT dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
-typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
+typedef void (*VP8DecIdct2)(const int16_t* WEBP_RESTRICT coeffs,
+                            uint8_t* WEBP_RESTRICT dst, int do_two);
 extern VP8DecIdct2 VP8Transform;
 extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
@ -203,6 +220,11 @@ extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;
 extern VP8WHT VP8TransformWHT;

+#define WEBP_TRANSFORM_AC3_C1 20091
+#define WEBP_TRANSFORM_AC3_C2 35468
+#define WEBP_TRANSFORM_AC3_MUL1(a) ((((a) * WEBP_TRANSFORM_AC3_C1) >> 16) + (a))
+#define WEBP_TRANSFORM_AC3_MUL2(a) (((a) * WEBP_TRANSFORM_AC3_C2) >> 16)
+
 // *dst is the destination block, with stride BPS. Boundary samples are
 // assumed accessible when needed.
 typedef void (*VP8PredFunc)(uint8_t* dst);
@ -228,7 +250,8 @@ extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
 // regular filter (on both macroblock edges and inner edges)
 typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
                                  int thresh, int ithresh, int hev_t);
-typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
+typedef void (*VP8ChromaFilterFunc)(uint8_t* WEBP_RESTRICT u,
+                                    uint8_t* WEBP_RESTRICT v, int stride,
                                    int thresh, int ithresh, int hev_t);
 // on outer edge
 extern VP8LumaFilterFunc VP8VFilter16;
@ -248,8 +271,8 @@ extern VP8ChromaFilterFunc VP8HFilter8i;
 #define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
 #define VP8_DITHER_AMP_BITS 7
 #define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
-extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
-                                   int dst_stride);
+extern void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
+                                   uint8_t* WEBP_RESTRICT dst, int dst_stride);

 // must be called before anything using the above
 void VP8DspInit(void);
@ -262,10 +285,10 @@ void VP8DspInit(void);
 // Convert a pair of y/u/v lines together to the output rgb/a colorspace.
 // bottom_y can be NULL if only one line of output is needed (at top/bottom).
 typedef void (*WebPUpsampleLinePairFunc)(
-    const uint8_t* top_y, const uint8_t* bottom_y,
-    const uint8_t* top_u, const uint8_t* top_v,
-    const uint8_t* cur_u, const uint8_t* cur_v,
-    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+    const uint8_t* WEBP_RESTRICT top_y, const uint8_t* WEBP_RESTRICT bottom_y,
+    const uint8_t* WEBP_RESTRICT top_u, const uint8_t* WEBP_RESTRICT top_v,
+    const uint8_t* WEBP_RESTRICT cur_u, const uint8_t* WEBP_RESTRICT cur_v,
+    uint8_t* WEBP_RESTRICT top_dst, uint8_t* WEBP_RESTRICT bottom_dst, int len);

 #ifdef FANCY_UPSAMPLING

@ -275,13 +298,15 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 #endif    // FANCY_UPSAMPLING

 // Per-row point-sampling methods.
-typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
-                                   const uint8_t* u, const uint8_t* v,
-                                   uint8_t* dst, int len);
+typedef void (*WebPSamplerRowFunc)(const uint8_t* WEBP_RESTRICT y,
+                                   const uint8_t* WEBP_RESTRICT u,
+                                   const uint8_t* WEBP_RESTRICT v,
+                                   uint8_t* WEBP_RESTRICT dst, int len);
 // Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
-void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
-                             const uint8_t* u, const uint8_t* v, int uv_stride,
-                             uint8_t* dst, int dst_stride,
+void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v, int uv_stride,
+                             uint8_t* WEBP_RESTRICT dst, int dst_stride,
                             int width, int height, WebPSamplerRowFunc func);

 // Sampling functions to convert rows of YUV to RGB(A)
@ -293,9 +318,10 @@ extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
 WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);

 // YUV444->RGB converters
 // Function type taking 'y', 'u' and 'v' input pointers, an output pointer
 // 'dst' and a length 'len' (presumably in pixels -- confirm).
-typedef void (*WebPYUV444Converter)(const uint8_t* y,
-                                    const uint8_t* u, const uint8_t* v,
-                                    uint8_t* dst, int len);
+typedef void (*WebPYUV444Converter)(const uint8_t* WEBP_RESTRICT y,
+                                    const uint8_t* WEBP_RESTRICT u,
+                                    const uint8_t* WEBP_RESTRICT v,
+                                    uint8_t* WEBP_RESTRICT dst, int len);

 extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];

@ -311,26 +337,35 @@ void WebPInitYUV444Converters(void);
 // ARGB -> YUV converters

 // Convert ARGB samples to luma Y.
 // Function pointer: reads from 'argb' and writes to 'y'; 'width' is the row
 // width. WebPInitConvertARGBToYUV() must be called before using it.
-extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+extern void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
+                                  uint8_t* WEBP_RESTRICT y, int width);
 // Convert ARGB samples to U/V with downsampling. 'do_store' should be '1' for
 // even lines and '0' for odd ones. 'src_width' is the original width, not
 // the U/V one.
 // NOTE(review): 'u' and 'v' are both WEBP_RESTRICT-qualified; presumably they
 // must be distinct, non-overlapping buffers -- confirm.
-extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+extern void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
+                                   uint8_t* WEBP_RESTRICT u,
+                                   uint8_t* WEBP_RESTRICT v,
                                   int src_width, int do_store);

 // Convert a row of accumulated (four-values) rgba32 samples to U/V.
 // 'rgb' holds the 16-bit accumulated input; results are written to 'u' and
 // 'v'.
-extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
-                                     uint8_t* u, uint8_t* v, int width);
+extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
+                                     uint8_t* WEBP_RESTRICT u,
+                                     uint8_t* WEBP_RESTRICT v, int width);

 // Convert RGB or BGR to Y.
 // One function pointer per input channel order: 'rgb' or 'bgr' is the input,
 // 'y' the output, 'width' the row width.
-extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
-extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
+                                   uint8_t* WEBP_RESTRICT y, int width);
+extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
+                                   uint8_t* WEBP_RESTRICT y, int width);

 // Plain-C implementations, used as fallback. Their parameter lists match
 // those of the WebPConvertARGBToUV and WebPConvertRGBA32ToUV function
 // pointers.
-extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
+                                  uint8_t* WEBP_RESTRICT u,
+                                  uint8_t* WEBP_RESTRICT v,
                                  int src_width, int do_store);
-extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
-                                    uint8_t* u, uint8_t* v, int width);
+extern void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
+                                    uint8_t* WEBP_RESTRICT u,
+                                    uint8_t* WEBP_RESTRICT v, int width);

 // Must be called before using the above.
 void WebPInitConvertARGBToYUV(void);
@ -343,8 +378,9 @@ struct WebPRescaler;
 // Import a row of data and save its contribution in the rescaler.
 // 'channel' denotes the channel number to be imported. 'Expand' corresponds to
 // the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
 // NOTE(review): this signature takes only 'wrk' and 'src'; there is no
 // 'channel' parameter, so the sentence about it may be stale -- confirm.
-typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
-                                          const uint8_t* src);
+typedef void (*WebPRescalerImportRowFunc)(
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
+    const uint8_t* WEBP_RESTRICT src);

 extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
 extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
@ -357,16 +393,19 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
 extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;

 // Plain-C implementations, as fall-back: row import and row export, each in
 // an 'Expand' and a 'Shrink' variant. Only the import functions take a
 // source row 'src'.
-extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
-                                          const uint8_t* src);
-extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
-                                          const uint8_t* src);
+extern void WebPRescalerImportRowExpand_C(
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
+    const uint8_t* WEBP_RESTRICT src);
+extern void WebPRescalerImportRowShrink_C(
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
+    const uint8_t* WEBP_RESTRICT src);
 extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
 extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);

 // Main entry calls:
 // Import the row 'src' into the rescaler 'wrk'.
-extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
-                                  const uint8_t* src);
+extern void WebPRescalerImportRow(
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
+    const uint8_t* WEBP_RESTRICT src);
 // Export one row (starting at x_out position) from rescaler.
 extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);

@ -475,8 +514,9 @@ typedef enum {     // Filter types.
  WEBP_FILTER_FAST
 } WEBP_FILTER_TYPE;

-typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
-                               int stride, uint8_t* out);
+typedef void (*WebPFilterFunc)(const uint8_t* WEBP_RESTRICT in,
+                               int width, int height, int stride,
+                               uint8_t* WEBP_RESTRICT out);
 // In-place un-filtering.
 // Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'.
 typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds,
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -59,9 +59,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
 }

 #if !WEBP_NEON_OMIT_C_CODE
-static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref,
+                               const uint8_t* WEBP_RESTRICT pred,
                               int start_block, int end_block,
-                               VP8Histogram* const histo) {
+                               VP8Histogram* WEBP_RESTRICT const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
@ -109,20 +110,19 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 #define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))

-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
-#define MUL(a, b) (((a) * (b)) >> 16)
-
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                      uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
+                                      const int16_t* WEBP_RESTRICT in,
+                                      uint8_t* WEBP_RESTRICT dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
  for (i = 0; i < 4; ++i) {    // vertical pass
    const int a = in[0] + in[8];
    const int b = in[0] - in[8];
-    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
-    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
+    const int c =
+        WEBP_TRANSFORM_AC3_MUL2(in[4]) - WEBP_TRANSFORM_AC3_MUL1(in[12]);
+    const int d =
+        WEBP_TRANSFORM_AC3_MUL1(in[4]) + WEBP_TRANSFORM_AC3_MUL2(in[12]);
    tmp[0] = a + d;
    tmp[1] = b + c;
    tmp[2] = b - c;
@ -134,10 +134,12 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  tmp = C;
  for (i = 0; i < 4; ++i) {    // horizontal pass
    const int dc = tmp[0] + 4;
-    const int a =  dc +  tmp[8];
-    const int b =  dc -  tmp[8];
-    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
-    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+    const int a = dc + tmp[8];
+    const int b = dc - tmp[8];
+    const int c =
+        WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]);
+    const int d =
+        WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]);
    STORE(0, i, a + d);
    STORE(1, i, b + c);
    STORE(2, i, b - c);
@ -146,7 +148,9 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  }
 }

-static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+static void ITransform_C(const uint8_t* WEBP_RESTRICT ref,
+                         const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst,
                         int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
@ -154,7 +158,9 @@ static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
  }
 }

-static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_C(const uint8_t* WEBP_RESTRICT src,
+                         const uint8_t* WEBP_RESTRICT ref,
+                         int16_t* WEBP_RESTRICT out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@ -184,14 +190,16 @@ static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE

-static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
-                          int16_t* out) {
+static void FTransform2_C(const uint8_t* WEBP_RESTRICT src,
+                          const uint8_t* WEBP_RESTRICT ref,
+                          int16_t* WEBP_RESTRICT out) {
  // Runs VP8FTransform twice: first on the block at src/ref, then on the one
  // starting 4 bytes further. The second result is stored 16 coefficients
  // after the first in 'out'.
  VP8FTransform(src, ref, out);
  VP8FTransform(src + 4, ref + 4, out + 16);
 }

 #if !WEBP_NEON_OMIT_C_CODE
-static void FTransformWHT_C(const int16_t* in, int16_t* out) {
+static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in,
+                            int16_t* WEBP_RESTRICT out) {
  // input is 12b signed
  int32_t tmp[16];
  int i;
@ -222,7 +230,6 @@ static void FTransformWHT_C(const int16_t* in, int16_t* out) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE

-#undef MUL
 #undef STORE

 //------------------------------------------------------------------------------
@ -235,8 +242,9 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  }
 }

-static WEBP_INLINE void VerticalPred(uint8_t* dst,
-                                     const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst,
+                                     const uint8_t* WEBP_RESTRICT top,
+                                     int size) {
  int j;
  if (top != NULL) {
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
@ -245,8 +253,9 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,
  }
 }

-static WEBP_INLINE void HorizontalPred(uint8_t* dst,
-                                       const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst,
+                                       const uint8_t* WEBP_RESTRICT left,
+                                       int size) {
  if (left != NULL) {
    int j;
    for (j = 0; j < size; ++j) {
@ -257,8 +266,9 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
  }
 }

-static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
-                                   const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT left,
+                                   const uint8_t* WEBP_RESTRICT top, int size) {
  int y;
  if (left != NULL) {
    if (top != NULL) {
@ -287,8 +297,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
  }
 }

-static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
-                               const uint8_t* top,
+static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst,
+                               const uint8_t* WEBP_RESTRICT left,
+                               const uint8_t* WEBP_RESTRICT top,
                               int size, int round, int shift) {
  int DC = 0;
  int j;
@ -313,8 +324,9 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

-static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
-                               const uint8_t* top) {
+static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst,
+                               const uint8_t* WEBP_RESTRICT left,
+                               const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
@ -333,22 +345,28 @@ static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

-static void Intra16Preds_C(uint8_t* dst,
-                           const uint8_t* left, const uint8_t* top) {
+#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
+static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst,
+                           const uint8_t* WEBP_RESTRICT left,
+                           const uint8_t* WEBP_RESTRICT top) {
  // Writes the four size-16 luma predictors into 'dst', each at its own
  // offset: DC (I16DC16), vertical (I16VE16), horizontal (I16HE16) and
  // TrueMotion (I16TM16).
  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
  VerticalPred(I16VE16 + dst, top, 16);
  HorizontalPred(I16HE16 + dst, left, 16);
  TrueMotion(I16TM16 + dst, left, top, 16);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64

 //------------------------------------------------------------------------------
 // luma 4x4 prediction

+#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
+
 #define DST(x, y) dst[(x) + (y) * BPS]
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

-static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+// vertical
+static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const uint8_t vals[4] = {
    AVG3(top[-1], top[0], top[1]),
    AVG3(top[ 0], top[1], top[2]),
@ -361,7 +379,8 @@ static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
  }
 }

-static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+// horizontal
+static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -373,14 +392,14 @@ static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }

-static void DC4(uint8_t* dst, const uint8_t* top) {
+static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  // DC predictor: rounded average of the eight samples top[0..3] and
  // top[-5..-2]. 'dc' starts at 4 so that the final '>> 3' rounds to nearest.
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  // Hand the averaged value to Fill() with size 4.
  Fill(dst, dc >> 3, 4);
 }

-static void RD4(uint8_t* dst, const uint8_t* top) {
+static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -399,7 +418,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
  DST(3, 0)                                     = AVG3(D, C, B);
 }

-static void LD4(uint8_t* dst, const uint8_t* top) {
+static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
@ -417,7 +436,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
  DST(3, 3)                                     = AVG3(G, H, H);
 }

-static void VR4(uint8_t* dst, const uint8_t* top) {
+static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -439,7 +458,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
  DST(3, 1) =             AVG3(B, C, D);
 }

-static void VL4(uint8_t* dst, const uint8_t* top) {
+static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
@ -461,7 +480,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
              DST(3, 3) = AVG3(F, G, H);
 }

-static void HU4(uint8_t* dst, const uint8_t* top) {
+static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
@ -476,7 +495,7 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static void HD4(uint8_t* dst, const uint8_t* top) {
+static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -499,7 +518,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
  DST(1, 3)             = AVG3(L, K, J);
 }

-static void TM4(uint8_t* dst, const uint8_t* top) {
+static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int x, y;
  const uint8_t* const clip = clip1 + 255 - top[-1];
  for (y = 0; y < 4; ++y) {
@ -517,7 +536,8 @@ static void TM4(uint8_t* dst, const uint8_t* top) {

 // Left samples are top[-5 .. -2], top-left is top[-1], top samples are
 // located at top[0..3], and top-right samples at top[4..7].
-static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst,
+                          const uint8_t* WEBP_RESTRICT top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@ -530,11 +550,14 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
  HU4(I4HU4 + dst, top);
 }

+#endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
+
 //------------------------------------------------------------------------------
 // Metric

 #if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a,
+                              const uint8_t* WEBP_RESTRICT b,
                              int w, int h) {
  int count = 0;
  int y, x;
@ -549,21 +572,25 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
  return count;
 }

-static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a,
+                      const uint8_t* WEBP_RESTRICT b) {
  // 16x16 variant: forwards to GetSSE() with w = 16, h = 16.
  return GetSSE(a, b, 16, 16);
 }
-static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a,
+                     const uint8_t* WEBP_RESTRICT b) {
  // 16x8 variant: forwards to GetSSE() with w = 16, h = 8.
  return GetSSE(a, b, 16, 8);
 }
-static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a,
+                    const uint8_t* WEBP_RESTRICT b) {
  // 8x8 variant: forwards to GetSSE() with w = 8, h = 8.
  return GetSSE(a, b, 8, 8);
 }
-static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a,
+                    const uint8_t* WEBP_RESTRICT b) {
  // 4x4 variant: forwards to GetSSE() with w = 4, h = 4.
  return GetSSE(a, b, 4, 4);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE

-static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    uint32_t avg = 0;
@ -587,7 +614,8 @@ static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* in, const uint16_t* w) {
+static int TTransform(const uint8_t* WEBP_RESTRICT in,
+                      const uint16_t* WEBP_RESTRICT w) {
  int sum = 0;
  int tmp[16];
  int i;
@ -621,15 +649,17 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
  return sum;
 }

-static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a,
+                      const uint8_t* WEBP_RESTRICT const b,
+                      const uint16_t* WEBP_RESTRICT const w) {
  // Applies TTransform() to 'a' and to 'b' with the same weights 'w', then
  // returns the absolute difference of the two sums, shifted right by 5.
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
  return abs(sum2 - sum1) >> 5;
 }

-static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
-                        const uint16_t* const w) {
+static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a,
+                        const uint8_t* WEBP_RESTRICT const b,
+                        const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -645,13 +675,14 @@ static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 //

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
 };

 // Simple quantization
 static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
-                           const VP8Matrix* const mtx) {
+                           const VP8Matrix* WEBP_RESTRICT const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
@ -676,9 +707,8 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
  return (last >= 0);
 }

-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
-                             const VP8Matrix* const mtx) {
+                             const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -689,7 +719,8 @@ static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
 //------------------------------------------------------------------------------
 // Block copy

-static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
+static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src,
+                             uint8_t* WEBP_RESTRICT dst, int w, int h) {
  int y;
  for (y = 0; y < h; ++y) {
    memcpy(dst, src, w);
@ -698,11 +729,13 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  }
 }

-static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
+static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src,
+                      uint8_t* WEBP_RESTRICT dst) {
  // 4x4 block copy: forwards to Copy() with w = 4, h = 4.
  Copy(src, dst, 4, 4);
 }

-static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
+static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src,
+                       uint8_t* WEBP_RESTRICT dst) {
  // 16x8 block copy: forwards to Copy() with w = 16, h = 8.
  Copy(src, dst, 16, 8);
 }

@ -761,14 +794,19 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
  VP8EncQuantizeBlock = QuantizeBlock_C;
  VP8EncQuantize2Blocks = Quantize2Blocks_C;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
+  VP8EncPredLuma4 = Intra4Preds_C;
+#endif
+#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
+  VP8EncPredLuma16 = Intra16Preds_C;
 #endif

  VP8FTransform2 = FTransform2_C;
-  VP8EncPredLuma4 = Intra4Preds_C;
-  VP8EncPredLuma16 = Intra16Preds_C;
  VP8EncPredChroma8 = IntraChromaPreds_C;
  VP8Mean16x4 = Mean16x4_C;
-  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
  VP8Copy4x4 = Copy4x4_C;
  VP8Copy16x8 = Copy16x8_C;

--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@ -21,8 +21,8 @@
 #include "src/enc/vp8i_enc.h"
 #include "src/enc/cost_enc.h"

-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
+static const int kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int kC2 = WEBP_TRANSFORM_AC3_C2;

 // macro for one vertical pass in ITransformOne
 // MUL macro inlined
@ -30,7 +30,7 @@ static const int kC2 = 35468;
 // A..D - offsets in bytes to load from in buffer
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 // TEMP4..TEMP5 - temporary registers
-#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
+#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
  "lh      %[temp16],      " #A "(%[temp20])                 \n\t"          \
  "lh      %[temp18],      " #B "(%[temp20])                 \n\t"          \
  "lh      %[temp17],      " #C "(%[temp20])                 \n\t"          \
@ -38,12 +38,10 @@ static const int kC2 = 35468;
  "addu    %[" #TEMP4 "],    %[temp16],      %[temp18]       \n\t"          \
  "subu    %[temp16],      %[temp16],      %[temp18]         \n\t"          \
  "mul     %[" #TEMP0 "],    %[temp17],      %[kC2]          \n\t"          \
-  "mul     %[temp18],      %[temp19],      %[kC1]            \n\t"          \
-  "mul     %[temp17],      %[temp17],      %[kC1]            \n\t"          \
+  MUL_SHIFT_C1_IO(temp17, temp18)                                           \
+  MUL_SHIFT_C1(temp18, temp19)                                              \
  "mul     %[temp19],      %[temp19],      %[kC2]            \n\t"          \
  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\n"          \
-  "sra     %[temp18],      %[temp18],      16                \n\n"          \
-  "sra     %[temp17],      %[temp17],      16                \n\n"          \
  "sra     %[temp19],      %[temp19],      16                \n\n"          \
  "subu    %[" #TEMP2 "],    %[" #TEMP0 "],    %[temp18]     \n\t"          \
  "addu    %[" #TEMP3 "],    %[temp17],      %[temp19]       \n\t"          \
@ -58,17 +56,15 @@ static const int kC2 = 35468;
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A - offset in bytes to load from ref and store to dst buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12)                       \
+#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4               \n\t"          \
  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]          \n\t"          \
-  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]          \n\t"          \
-  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]          \n\t"          \
+  MUL_SHIFT_C1_IO(TEMP4, TEMP8)                                               \
+  MUL_SHIFT_C1(TEMP8, TEMP12)                                                 \
  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]          \n\t"          \
  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16              \n\t"          \
-  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16              \n\t"          \
-  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16              \n\t"          \
  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16              \n\t"          \
  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]    \n\t"          \
@ -113,9 +109,9 @@ static const int kC2 = 35468;
  "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"

 // Does one or two inverse transforms.
-static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
-                                             const int16_t* in,
-                                             uint8_t* dst) {
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* WEBP_RESTRICT ref,
+                                             const int16_t* WEBP_RESTRICT in,
+                                             uint8_t* WEBP_RESTRICT dst) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@ -145,8 +141,9 @@ static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
  );
 }

-static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
-                              uint8_t* dst, int do_two) {
+static void ITransform_MIPS32(const uint8_t* WEBP_RESTRICT ref,
+                              const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne_MIPS32(ref, in, dst);
  if (do_two) {
    ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
@ -240,7 +237,7 @@ static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
 }

 static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
-                                  const VP8Matrix* const mtx) {
+                                  const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -362,8 +359,9 @@ static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
  "msub   %[temp6],  %[temp0]                \n\t"                \
  "msub   %[temp7],  %[temp1]                \n\t"

-static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto4x4_MIPS32(const uint8_t* WEBP_RESTRICT const a,
+                           const uint8_t* WEBP_RESTRICT const b,
+                           const uint16_t* WEBP_RESTRICT const w) {
  int tmp[32];
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

@ -397,8 +395,9 @@ static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS

-static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
-                             const uint16_t* const w) {
+static int Disto16x16_MIPS32(const uint8_t* WEBP_RESTRICT const a,
+                             const uint8_t* WEBP_RESTRICT const b,
+                             const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -479,8 +478,9 @@ static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"

-static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
-                              int16_t* out) {
+static void FTransform_MIPS32(const uint8_t* WEBP_RESTRICT src,
+                              const uint8_t* WEBP_RESTRICT ref,
+                              int16_t* WEBP_RESTRICT out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
  int temp17, temp18, temp19, temp20;
@ -541,7 +541,8 @@ static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)

-static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPS32(const uint8_t* WEBP_RESTRICT a,
+                           const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@ -575,7 +576,8 @@ static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
+                          const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@ -601,7 +603,8 @@ static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
+                         const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@ -623,7 +626,8 @@ static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPS32(const uint8_t* WEBP_RESTRICT a,
+                         const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@ -20,8 +20,8 @@
 #include "src/enc/cost_enc.h"
 #include "src/enc/vp8i_enc.h"

-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
+static const int kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int kC2 = WEBP_TRANSFORM_AC3_C2;

 // O - output
 // I - input (macro doesn't change it)
@ -141,8 +141,9 @@ static const int kC2 = 35468;
  "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"

-static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
-                                 int16_t* out) {
+static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
+                                 const uint8_t* WEBP_RESTRICT ref,
+                                 int16_t* WEBP_RESTRICT out) {
  const int c2217 = 2217;
  const int c5352 = 5352;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -171,8 +172,9 @@ static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS

-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                      uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
+                                      const int16_t* WEBP_RESTRICT in,
+                                      uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

@ -239,16 +241,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  );
 }

-static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
-                                 uint8_t* dst, int do_two) {
+static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref,
+                                 const int16_t* WEBP_RESTRICT in,
+                                 uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

-static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
-                              const uint16_t* const w) {
+static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
+                              const uint8_t* WEBP_RESTRICT const b,
+                              const uint16_t* WEBP_RESTRICT const w) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;

@ -314,9 +318,9 @@ static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
  return abs(temp3 - temp17) >> 5;
 }

-static int Disto16x16_MIPSdspR2(const uint8_t* const a,
-                                const uint8_t* const b,
-                                const uint16_t* const w) {
+static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
+                                const uint8_t* WEBP_RESTRICT const b,
+                                const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -367,8 +371,8 @@ static int Disto16x16_MIPSdspR2(const uint8_t* const a,
 } while (0)

 #define VERTICAL_PRED(DST, TOP, SIZE)                                          \
-static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST),                     \
-                                           const uint8_t* (TOP)) {             \
+static WEBP_INLINE void VerticalPred##SIZE(                                    \
+    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) {        \
  int j;                                                                       \
  if ((TOP)) {                                                                 \
    for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
@ -383,8 +387,8 @@ VERTICAL_PRED(dst, top, 16)
 #undef VERTICAL_PRED

 #define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
-static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST),                   \
-                                             const uint8_t* (LEFT)) {          \
+static WEBP_INLINE void HorizontalPred##SIZE(                                  \
+    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) {       \
  if (LEFT) {                                                                  \
    int j;                                                                     \
    for (j = 0; j < (SIZE); ++j) {                                             \
@ -451,8 +455,9 @@ HORIZONTAL_PRED(dst, left, 16)
 } while (0)

 #define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
-static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
-                                         const uint8_t* (TOP)) {               \
+static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST),         \
+                                         const uint8_t* WEBP_RESTRICT (LEFT),  \
+                                         const uint8_t* WEBP_RESTRICT (TOP)) { \
  if ((LEFT) != NULL) {                                                        \
    if ((TOP) != NULL) {                                                       \
      CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
@ -480,8 +485,9 @@ TRUE_MOTION(dst, left, top, 16)
 #undef CLIP_8B_TO_DST
 #undef CLIPPING

-static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
-                                 const uint8_t* top) {
+static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT left,
+                                 const uint8_t* WEBP_RESTRICT top) {
  int DC, DC1;
  int temp0, temp1, temp2, temp3;

@ -543,8 +549,9 @@ static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
  FILL_8_OR_16(dst, DC, 16);
 }

-static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
-                                const uint8_t* top) {
+static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst,
+                                const uint8_t* WEBP_RESTRICT left,
+                                const uint8_t* WEBP_RESTRICT top) {
  int DC, DC1;
  int temp0, temp1, temp2, temp3;

@ -588,7 +595,7 @@ static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
  FILL_8_OR_16(dst, DC, 8);
 }

-static void DC4(uint8_t* dst, const uint8_t* top) {
+static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1;
  __asm__ volatile(
    "ulw          %[temp0],   0(%[top])               \n\t"
@ -609,7 +616,7 @@ static void DC4(uint8_t* dst, const uint8_t* top) {
  );
 }

-static void TM4(uint8_t* dst, const uint8_t* top) {
+static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
  const int c35 = 0xff00ff;
  __asm__ volatile (
@ -664,7 +671,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
  );
 }

-static void VE4(uint8_t* dst, const uint8_t* top) {
+static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    "ulw             %[temp0],   -1(%[top])              \n\t"
@ -695,7 +702,7 @@ static void VE4(uint8_t* dst, const uint8_t* top) {
  );
 }

-static void HE4(uint8_t* dst, const uint8_t* top) {
+static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    "ulw             %[temp0],   -4(%[top])              \n\t"
@ -731,7 +738,7 @@ static void HE4(uint8_t* dst, const uint8_t* top) {
  );
 }

-static void RD4(uint8_t* dst, const uint8_t* top) {
+static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(
@ -780,7 +787,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
  );
 }

-static void VR4(uint8_t* dst, const uint8_t* top) {
+static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
@ -830,7 +837,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
  );
 }

-static void LD4(uint8_t* dst, const uint8_t* top) {
+static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(
@ -877,7 +884,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
  );
 }

-static void VL4(uint8_t* dst, const uint8_t* top) {
+static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
@ -926,7 +933,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
  );
 }

-static void HD4(uint8_t* dst, const uint8_t* top) {
+static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
@ -974,7 +981,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
  );
 }

-static void HU4(uint8_t* dst, const uint8_t* top) {
+static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  __asm__ volatile (
    "ulw             %[temp0],   -5(%[top])              \n\t"
@ -1013,8 +1020,9 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

-static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
-                                       const uint8_t* top) {
+static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
+                                       const uint8_t* WEBP_RESTRICT left,
+                                       const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode8(C8DC8 + dst, left, top);
  VerticalPred8(C8VE8 + dst, top);
@ -1033,8 +1041,9 @@ static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

-static void Intra16Preds_MIPSdspR2(uint8_t* dst,
-                                   const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT left,
+                                   const uint8_t* WEBP_RESTRICT top) {
  DCMode16(I16DC16 + dst, left, top);
  VerticalPred16(I16VE16 + dst, top);
  HorizontalPred16(I16HE16 + dst, left);
@ -1043,7 +1052,8 @@ static void Intra16Preds_MIPSdspR2(uint8_t* dst,

 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@ -1079,7 +1089,8 @@ static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
  GET_SSE_INNER(C)                        \
  GET_SSE_INNER(D)

-static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
+                              const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1109,7 +1120,8 @@ static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
+                             const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1131,7 +1143,8 @@ static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
+                            const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1149,7 +1162,8 @@ static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
 }

-static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
+                            const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1273,7 +1287,7 @@ static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
 "3:                                                          \n\t"

 static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
-                                   const VP8Matrix* const mtx) {
+                                   const VP8Matrix* WEBP_RESTRICT const mtx) {
  int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
  int sign, coeff, level;
  int max_level = MAX_LEVEL;
@ -1314,7 +1328,7 @@ static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
 }

 static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
-                                     const VP8Matrix* const mtx) {
+                                     const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -1360,7 +1374,8 @@ static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
  "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
  "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"

-static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in,
+                                    int16_t* WEBP_RESTRICT out) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;

--- a/src/dsp/enc_msa.c
+++ b/src/dsp/enc_msa.c
@ -41,8 +41,9 @@
  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);      \
 } while (0)

-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                      uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
+                                      const int16_t* WEBP_RESTRICT in,
+                                      uint8_t* WEBP_RESTRICT dst) {
  v8i16 input0, input1;
  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
  v4i32 res0, res1, res2, res3;
@ -69,16 +70,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }

-static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                           int do_two) {
+static void ITransform_MSA(const uint8_t* WEBP_RESTRICT ref,
+                           const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }

-static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
-                           int16_t* out) {
+static void FTransform_MSA(const uint8_t* WEBP_RESTRICT src,
+                           const uint8_t* WEBP_RESTRICT ref,
+                           int16_t* WEBP_RESTRICT out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
@ -131,7 +134,8 @@ static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
  SD4(out0, out1, out2, out3, out, 8);
 }

-static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MSA(const int16_t* WEBP_RESTRICT in,
+                              int16_t* WEBP_RESTRICT out) {
  v8i16 in0 = { 0 };
  v8i16 in1 = { 0 };
  v8i16 tmp0, tmp1, tmp2, tmp3;
@ -168,7 +172,8 @@ static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
  ST_SH2(out0, out1, out, 8);
 }

-static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
+static int TTransform_MSA(const uint8_t* WEBP_RESTRICT in,
+                          const uint16_t* WEBP_RESTRICT w) {
  int sum;
  uint32_t in0_m, in1_m, in2_m, in3_m;
  v16i8 src0 = { 0 };
@ -200,15 +205,17 @@ static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
  return sum;
 }

-static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
-                        const uint16_t* const w) {
+static int Disto4x4_MSA(const uint8_t* WEBP_RESTRICT const a,
+                        const uint8_t* WEBP_RESTRICT const b,
+                        const uint16_t* WEBP_RESTRICT const w) {
  const int sum1 = TTransform_MSA(a, w);
  const int sum2 = TTransform_MSA(b, w);
  return abs(sum2 - sum1) >> 5;
 }

-static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
-                          const uint16_t* const w) {
+static int Disto16x16_MSA(const uint8_t* WEBP_RESTRICT const a,
+                          const uint8_t* WEBP_RESTRICT const b,
+                          const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -259,7 +266,9 @@ static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

-static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+// vertical
+static WEBP_INLINE void VE4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  const v16u8 A1 = { 0 };
  const uint64_t val_m = LD(top - 1);
  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
@ -272,7 +281,9 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
  SW4(out, out, out, out, dst, BPS);
 }

-static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+// horizontal
+static WEBP_INLINE void HE4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -284,7 +295,8 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }

-static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
@ -293,7 +305,8 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
  SW4(dc, dc, dc, dc, dst, BPS);
 }

-static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void RD4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  const v16u8 A2 = { 0 };
  const uint64_t val_m = LD(top - 5);
  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
@ -313,7 +326,8 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
  SW4(val3, val2, val1, val0, dst, BPS);
 }

-static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void LD4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  const v16u8 A1 = { 0 };
  const uint64_t val_m = LD(top);
  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
@ -333,7 +347,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
  SW4(val0, val1, val2, val3, dst, BPS);
 }

-static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VR4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -354,7 +369,8 @@ static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
  DST(3, 1) =             AVG3(B, C, D);
 }

-static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VL4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
@ -375,7 +391,8 @@ static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
              DST(3, 3) = AVG3(F, G, H);
 }

-static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
@ -390,7 +407,8 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -411,7 +429,8 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
  DST(1, 3)             = AVG3(L, K, J);
 }

-static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  const v16i8 zero = { 0 };
  const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
  const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
@ -431,7 +450,8 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 #undef AVG3
 #undef AVG2

-static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MSA(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@ -451,7 +471,8 @@ static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
    ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);  \
 } while (0)

-static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VerticalPred16x16(uint8_t* WEBP_RESTRICT dst,
+                                          const uint8_t* WEBP_RESTRICT top) {
  if (top != NULL) {
    const v16u8 out = LD_UB(top);
    STORE16x16(out, dst);
@ -461,8 +482,8 @@ static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
  }
 }

-static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
-                                            const uint8_t* left) {
+static WEBP_INLINE void HorizontalPred16x16(uint8_t* WEBP_RESTRICT dst,
+                                            const uint8_t* WEBP_RESTRICT left) {
  if (left != NULL) {
    int j;
    for (j = 0; j < 16; j += 4) {
@ -480,8 +501,9 @@ static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
  }
 }

-static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
-                                        const uint8_t* top) {
+static WEBP_INLINE void TrueMotion16x16(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT left,
+                                        const uint8_t* WEBP_RESTRICT top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
@ -519,8 +541,9 @@ static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
  }
 }

-static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
-                                    const uint8_t* top) {
+static WEBP_INLINE void DCMode16x16(uint8_t* WEBP_RESTRICT dst,
+                                    const uint8_t* WEBP_RESTRICT left,
+                                    const uint8_t* WEBP_RESTRICT top) {
  int DC;
  v16u8 out;
  if (top != NULL && left != NULL) {
@ -548,8 +571,9 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
  STORE16x16(out, dst);
 }

-static void Intra16Preds_MSA(uint8_t* dst,
-                             const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MSA(uint8_t* WEBP_RESTRICT dst,
+                             const uint8_t* WEBP_RESTRICT left,
+                             const uint8_t* WEBP_RESTRICT top) {
  DCMode16x16(I16DC16 + dst, left, top);
  VerticalPred16x16(I16VE16 + dst, top);
  HorizontalPred16x16(I16HE16 + dst, left);
@ -574,7 +598,8 @@ static void Intra16Preds_MSA(uint8_t* dst,
  SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
 } while (0)

-static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VerticalPred8x8(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT top) {
  if (top != NULL) {
    const uint64_t out = LD(top);
    STORE8x8(out, dst);
@ -584,7 +609,8 @@ static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
  }
 }

-static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HorizontalPred8x8(uint8_t* WEBP_RESTRICT dst,
+                                          const uint8_t* WEBP_RESTRICT left) {
  if (left != NULL) {
    int j;
    for (j = 0; j < 8; j += 4) {
@ -606,8 +632,9 @@ static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
  }
 }

-static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
-                                      const uint8_t* top) {
+static WEBP_INLINE void TrueMotion8x8(uint8_t* WEBP_RESTRICT dst,
+                                      const uint8_t* WEBP_RESTRICT left,
+                                      const uint8_t* WEBP_RESTRICT top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
@ -646,8 +673,9 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
  }
 }

-static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static WEBP_INLINE void DCMode8x8(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT left,
+                                  const uint8_t* WEBP_RESTRICT top) {
  uint64_t out;
  v16u8 src = { 0 };
  if (top != NULL && left != NULL) {
@ -670,8 +698,9 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
  STORE8x8(out, dst);
 }

-static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
-                                 const uint8_t* top) {
+static void IntraChromaPreds_MSA(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT left,
+                                 const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode8x8(C8DC8 + dst, left, top);
  VerticalPred8x8(C8VE8 + dst, top);
@ -712,7 +741,8 @@ static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
 } while (0)

-static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MSA(const uint8_t* WEBP_RESTRICT a,
+                        const uint8_t* WEBP_RESTRICT b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -739,7 +769,8 @@ static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
  return sum;
 }

-static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MSA(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -758,7 +789,8 @@ static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
  return sum;
 }

-static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MSA(const uint8_t* WEBP_RESTRICT a,
+                      const uint8_t* WEBP_RESTRICT b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -778,7 +810,8 @@ static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
  return sum;
 }

-static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MSA(const uint8_t* WEBP_RESTRICT a,
+                      const uint8_t* WEBP_RESTRICT b) {
  uint32_t sum = 0;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
@ -801,7 +834,7 @@ static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
 // Quantization

 static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
-                             const VP8Matrix* const mtx) {
+                             const VP8Matrix* WEBP_RESTRICT const mtx) {
  int sum;
  v8i16 in0, in1, sh0, sh1, out0, out1;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@ -854,7 +887,7 @@ static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
 }

 static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
-                               const VP8Matrix* const mtx) {
+                               const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -27,8 +27,9 @@
 // This code is pretty much the same as TransformOne in the dec_neon.c, except
 // for subtraction to *ref. See the comments there for algorithmic explanations.

-static const int16_t kC1 = 20091;
-static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
+static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int16_t kC2 =
+    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.

 // This code works but is *slower* than the inlined-asm version below
 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
@ -59,8 +60,8 @@ static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,

 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
-                                    const uint8_t* const ref,
-                                    uint8_t* const dst) {
+                                    const uint8_t* WEBP_RESTRICT const ref,
+                                    uint8_t* WEBP_RESTRICT const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

@ -119,8 +120,9 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  Transpose8x2_NEON(E0, E1, rows);
 }

-static void ITransformOne_NEON(const uint8_t* ref,
-                               const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
+                               const int16_t* WEBP_RESTRICT in,
+                               uint8_t* WEBP_RESTRICT dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
@ -130,8 +132,9 @@ static void ITransformOne_NEON(const uint8_t* ref,

 #else

-static void ITransformOne_NEON(const uint8_t* ref,
-                               const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
+                               const int16_t* WEBP_RESTRICT in,
+                               uint8_t* WEBP_RESTRICT dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

@ -246,8 +249,9 @@ static void ITransformOne_NEON(const uint8_t* ref,

 #endif    // WEBP_USE_INTRINSICS

-static void ITransform_NEON(const uint8_t* ref,
-                            const int16_t* in, uint8_t* dst, int do_two) {
+static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
+                            const int16_t* WEBP_RESTRICT in,
+                            uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
@ -293,8 +297,9 @@ static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
 }

-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
+                            const uint8_t* WEBP_RESTRICT ref,
+                            int16_t* WEBP_RESTRICT out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
@ -363,8 +368,9 @@ static const int32_t kCoeff32[] = {
  51000, 51000, 51000, 51000
 };

-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
+                            const uint8_t* WEBP_RESTRICT ref,
+                            int16_t* WEBP_RESTRICT out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
@ -483,7 +489,8 @@ static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
  src += stride;                                    \
 } while (0)

-static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
+static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
+                               int16_t* WEBP_RESTRICT out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
@ -658,8 +665,9 @@ static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
-                         const uint16_t* const w) {
+static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
+                         const uint8_t* WEBP_RESTRICT const b,
+                         const uint16_t* WEBP_RESTRICT const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@ -700,8 +708,9 @@ static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
 }
 #undef LOAD_LANE_32b

-static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
+                           const uint8_t* WEBP_RESTRICT const b,
+                           const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -714,9 +723,10 @@ static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,

 //------------------------------------------------------------------------------

-static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
+                                  const uint8_t* WEBP_RESTRICT pred,
                                  int start_block, int end_block,
-                                  VP8Histogram* const histo) {
+                                  VP8Histogram* WEBP_RESTRICT const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@ -746,9 +756,9 @@ static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,

 //------------------------------------------------------------------------------

-static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
-                                             const uint8_t* const b,
-                                             uint32x4_t* const sum) {
+static WEBP_INLINE void AccumulateSSE16_NEON(
+    const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
+    uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@ -774,7 +784,8 @@ static int SumToInt_NEON(uint32x4_t sum) {
 #endif
 }

-static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
+                         const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
@ -783,7 +794,8 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  return SumToInt_NEON(sum);
 }

-static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
+                        const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
@ -792,7 +804,8 @@ static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  return SumToInt_NEON(sum);
 }

-static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
@ -805,7 +818,8 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
  return SumToInt_NEON(sum);
 }

-static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@ -824,8 +838,9 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
 // Compilation with gcc-4.6.x is problematic for now.
 #if !defined(WORK_AROUND_GCC)

-static int16x8_t Quantize_NEON(int16_t* const in,
-                               const VP8Matrix* const mtx, int offset) {
+static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
+                               const VP8Matrix* WEBP_RESTRICT const mtx,
+                               int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@ -859,7 +874,7 @@ static const uint8_t kShuffles[4][8] = {
 };

 static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
-                              const VP8Matrix* const mtx) {
+                              const VP8Matrix* WEBP_RESTRICT const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
@ -901,7 +916,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
 }

 static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
-                                const VP8Matrix* const mtx) {
+                                const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -910,6 +925,283 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],

 #endif   // !WORK_AROUND_GCC

+#if WEBP_AARCH64
+
+#if BPS == 32
+// Builds and stores one 16-byte row at 'dst'.
+// The row is a two-register table lookup into 'qcombined' (which must be in
+// scope where the macro is expanded) using the byte indices in 'tbl'; its
+// 32-bit lane 1 is then overwritten with 32-bit lane 'lane' of the 8-byte
+// vector 'res'.
+#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane)                              \
+  do {                                                                         \
+    uint8x16_t r;                                                              \
+    r = vqtbl2q_u8(qcombined, tbl);                                            \
+    r = vreinterpretq_u8_u32(                                                  \
+        vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane),          \
+                       vreinterpretq_u32_u8(r), 1));                           \
+    vst1q_u8(dst, r);                                                          \
+  } while (0)
+
+// Builds and stores one 16-byte row at 'dst': a two-register table lookup into
+// 'qcombined' (which must be in scope where the macro is expanded) using the
+// byte indices in 'tbl'.
+#define RD4_VR4_LD4_VL4_NEON(dst, tbl)                                         \
+  do {                                                                         \
+    uint8x16_t r;                                                              \
+    r = vqtbl2q_u8(qcombined, tbl);                                            \
+    vst1q_u8(dst, r);                                                          \
+  } while (0)
+
+// Loads the 64 bytes starting at 'ptr' into four consecutive 16-byte vectors.
+// Uses the single vld1q_u8_x4() intrinsic when the LOCAL_*_PREREQ checks pass
+// or when building with MSVC; otherwise falls back to four vld1q_u8() loads at
+// offsets 0, 16, 32 and 48.
+// NOTE(review): LOCAL_CLANG_PREREQ / LOCAL_GCC_PREREQ are defined elsewhere;
+// presumably they test for a minimum compiler version -- confirm.
+static WEBP_INLINE uint8x16x4_t Vld1qU8x4(const uint8_t* ptr) {
+#if LOCAL_CLANG_PREREQ(3, 4) || LOCAL_GCC_PREREQ(9, 4) || defined(_MSC_VER)
+  return vld1q_u8_x4(ptr);
+#else
+  uint8x16x4_t res;
+  INIT_VECTOR4(res,
+               vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+               vld1q_u8(ptr + 2 * 16), vld1q_u8(ptr + 3 * 16));
+  return res;
+#endif
+}
+
+// Computes the 4x4 intra predictors in one pass and writes them into 'dst':
+// four 16-byte rows at I4DC4 (DC4, VE4, HE4, TM4), four 16-byte rows at I4RD4
+// (RD4, VR4, LD4, VL4) and four 8-byte rows at I4HD4 (HD4, HU4). Rows are BPS
+// bytes apart.
+// All inputs come from a single 16-byte load at 'top - 5', i.e. lane k of
+// 'preload' below is top[k - 5]; the caller must make those 16 bytes readable.
+static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
+                             const uint8_t* WEBP_RESTRICT top) {
+  // 0   1   2   3   4   5   6   7   8   9  10  11  12  13
+  //     L   K   J   I   X   A   B   C   D   E   F   G   H
+  //    -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
+  // Row 0: indices building 'a' (preload shifted by one lane, edges repeated).
+  // Row 1: indices building 'full_b'. Rows 2-3: indices for the HD4/HU4 rows.
+  static const uint8_t kLookupTbl1[64] = {
+    0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 12, 12,
+    3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0,
+    4, 20, 21, 22,  3, 18,  2, 17,  3, 19,  4, 20,  2, 17,  1, 16,
+    2, 18,  3, 19,  1, 16, 31, 31,  1, 17,  2, 18, 31, 31, 31, 31
+  };
+
+  // One row of indices per output row of the RD4/VR4/LD4/VL4 group.
+  static const uint8_t kLookupTbl2[64] = {
+    20, 21, 22, 23,  5,  6,  7,  8, 22, 23, 24, 25,  6,  7,  8,  9,
+    19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
+    18, 19, 20, 21, 19,  5,  6,  7, 24, 25, 26, 27,  7,  8,  9, 26,
+    17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
+  };
+
+  // One row of indices per output row of the DC4/VE4/HE4/TM4 group. Bytes 4..7
+  // of each looked-up row are replaced afterwards by DC4_VE4_HE4_TM4_NEON.
+  static const uint8_t kLookupTbl3[64] = {
+    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 19, 19, 19, 19,
+    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 18, 18, 18, 18,
+    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 17, 17, 17, 17,
+    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 16, 16, 16, 16
+  };
+
+  const uint8x16x4_t lookup_avgs1 = Vld1qU8x4(kLookupTbl1);
+  const uint8x16x4_t lookup_avgs2 = Vld1qU8x4(kLookupTbl2);
+  const uint8x16x4_t lookup_avgs3 = Vld1qU8x4(kLookupTbl3);
+
+  const uint8x16_t preload = vld1q_u8(top - 5);
+  uint8x16x2_t qcombined;
+  uint8x16_t result0, result1;
+
+  // For lane i: a = previous sample, b = current sample, c = next sample
+  // (with the first/last samples repeated at the edges by the lookup table).
+  uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
+  uint8x16_t b = preload;
+  uint8x16_t c = vextq_u8(a, a, 2);
+
+  // Three-tap and two-tap averages of neighbouring samples, for all lanes.
+  uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
+  uint8x16_t avg2_all = vrhaddq_u8(a, b);
+
+  uint8x8_t preload_x8, sub_a, sub_c;
+  uint8_t result_u8;
+  uint8x8_t res_lo, res_hi;
+  uint8x16_t full_b;
+  uint16x8_t sub, sum_lo, sum_hi;
+
+  // The low half of c holds preload lanes 1..8; lane 3 (preload lane 4) is
+  // swapped for preload lane 0, leaving preload lanes 0..3 and 5..8.
+  preload_x8 = vget_low_u8(c);
+  preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);
+
+  // Rounded mean of those eight samples.
+  result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;
+
+  // Stash two extra values in the otherwise unused top lanes of avg3_all so
+  // the lookup tables can reach them as indices 31 and 30 of 'qcombined'.
+  avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
+  avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);
+
+  // Table indices 0..15 select from avg2_all, 16..31 from avg3_all.
+  qcombined.val[0] = avg2_all;
+  qcombined.val[1] = avg3_all;
+
+  // preload lane 4 (top[-1]) broadcast to all eight lanes.
+  sub_a = vdup_laneq_u8(preload, 4);
+
+  // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
+  full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
+  // With e = vextq_u8(preload, preload, 5) = {a,b,c,d,...}:
+  //   sub_c = {a,b,c,d,a,b,c,d} (the low 32 bits of e, duplicated).
+  sub_c = vreinterpret_u8_u32(vdup_n_u32(
+      vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));
+
+  // sub_c - sub_a + full_b, computed in 16 bits (the difference may wrap as
+  // unsigned; it is reinterpreted as signed below) and saturated to [0, 255].
+  sub = vsubl_u8(sub_c, sub_a);
+  sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
+  res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));
+
+  sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
+  res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));
+
+  // DC4, VE4, HE4, TM4
+  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
+  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
+  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
+  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);
+
+  // RD4, VR4, LD4, VL4
+  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
+  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
+  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
+  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);
+
+  // HD4, HU4
+  // Each 16-byte lookup result holds two 8-byte output rows.
+  result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
+  result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);
+
+  vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
+  vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
+  vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
+  vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
+}
+#endif  // BPS == 32
+
+// Fills a 16x16 block with 'value': writes 16 rows of 16 bytes each, with
+// consecutive rows BPS bytes apart, starting at 'dst'.
+static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
+  uint8x16_t a = vdupq_n_u8(value);
+  int i;
+  for (i = 0; i < 16; i++) {
+    vst1q_u8(dst + BPS * i, a);
+  }
+}
+
+// Replicates the 16 bytes at 'src' into each of 16 rows starting at 'dst',
+// with consecutive rows BPS bytes apart. 'src' is read once, before any store.
+static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
+  uint8x16_t a = vld1q_u8(src);
+  int i;
+  for (i = 0; i < 16; i++) {
+    vst1q_u8(dst + BPS * i, a);
+  }
+}
+
+// 16x16 horizontal prediction: row i of 'dst' (rows are BPS bytes apart) is
+// filled with 16 copies of left[i]. When 'left' is NULL the whole block is
+// filled with the constant 129 instead.
+static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
+                                              const uint8_t* left) {
+  uint8x16_t a;
+
+  if (left == NULL) {
+    Fill_NEON(dst, 129);
+    return;
+  }
+
+  // Load all 16 left samples once, then broadcast one lane per output row.
+  // The rows are written out one by one because vdupq_laneq_u8() is called
+  // with a literal lane index on every line.
+  a = vld1q_u8(left + 0);
+  vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
+  vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
+  vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
+  vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
+  vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
+  vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
+  vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
+  vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
+  vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
+  vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
+  vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
+  vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
+  vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
+  vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
+  vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
+  vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
+}
+
+// 16x16 vertical prediction: every row of 'dst' is a copy of the 16 bytes at
+// 'top'. When 'top' is NULL the whole block is filled with the constant 127.
+static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
+  if (top != NULL) {
+    Fill16_NEON(dst, top);
+  } else {
+    Fill_NEON(dst, 127);
+  }
+}
+
+// 16x16 DC prediction: fills 'dst' with a single value derived from the
+// available border samples.
+//  - top and left present: rounded mean of the 32 samples (shift by 5).
+//  - only one of them present: rounded mean of its 16 samples (shift by 4).
+//  - neither present: the constant 0x80.
+// Either pointer may be NULL; a non-NULL pointer must reference 16 readable
+// bytes.
+static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
+                                    const uint8_t* top) {
+  uint8_t s;
+
+  if (top != NULL) {
+    // At most 32 * 255 = 8160, so the running sum fits in 16 bits.
+    uint16_t dc;
+    dc = vaddlvq_u8(vld1q_u8(top));
+    if (left != NULL) {
+      // top and left present.
+      dc += vaddlvq_u8(vld1q_u8(left));
+      // Rounding right shift with narrowing to 8 bits.
+      s = vqrshrnh_n_u16(dc, 5);
+    } else {
+      // top but no left.
+      s = vqrshrnh_n_u16(dc, 4);
+    }
+  } else {
+    if (left != NULL) {
+      uint16_t dc;
+      // left but no top.
+      dc = vaddlvq_u8(vld1q_u8(left));
+      s = vqrshrnh_n_u16(dc, 4);
+    } else {
+      // No top, no left, nothing.
+      s = 0x80;
+    }
+  }
+  Fill_NEON(dst, s);
+}
+
+// Writes one 16-byte row of a TrueMotion prediction at row (i * 4 + n) of
+// 'dst' (rows are BPS bytes apart).
+//   outer: eight lanes added to every column of this row (the caller passes a
+//          broadcast left sample).
+//   inner: the 16 per-column samples, as two 8-byte halves.
+//   a:     the value subtracted from every sum, widened to 16 bits.
+// Each output byte is outer + inner - a, clamped to [0, 255].
+static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
+                                              const uint8x8_t outer,
+                                              const uint8x8x2_t inner,
+                                              const uint16x8_t a, int i,
+                                              const int n) {
+  uint8x8_t d1, d2;
+  uint16x8_t r1, r2;
+
+  // Widening add, then a saturating unsigned subtract (clamps at 0), then a
+  // saturating narrow back to 8 bits (clamps at 255).
+  r1 = vaddl_u8(outer, inner.val[0]);
+  r1 = vqsubq_u16(r1, a);
+  d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
+  r2 = vaddl_u8(outer, inner.val[1]);
+  r2 = vqsubq_u16(r2, a);
+  d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
+  // Columns 0..7, then columns 8..15 of the same row.
+  vst1_u8(dst + BPS * (i * 4 + n), d1);
+  vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
+}
+
+// 16x16 TrueMotion prediction into 'dst' (rows are BPS bytes apart).
+// With both borders present, pixel (x, y) is top[x] + left[y] - left[-1],
+// clamped to [0, 255]. If 'left' is NULL the block is a copy of 'top' (or the
+// constant 129 when 'top' is NULL too); if only 'top' is NULL the block is the
+// horizontal prediction from 'left'.
+// When both are non-NULL, left[-1] must be readable as well as left[0..15]
+// and top[0..15].
+static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
+                                        const uint8_t* top) {
+  int i;
+  uint16x8_t a;
+  uint8x8x2_t inner;
+
+  if (left == NULL) {
+    // True motion without left samples (hence: with default 129 value) is
+    // equivalent to VE prediction where you just copy the top samples.
+    // Note that if top samples are not available, the default value is then
+    // 129, and not 127 as in the VerticalPred case.
+    if (top != NULL) {
+      VerticalPred16_NEON(dst, top);
+    } else {
+      Fill_NEON(dst, 129);
+    }
+    return;
+  }
+
+  // left is not NULL.
+  if (top == NULL) {
+    HorizontalPred16_NEON(dst, left);
+    return;
+  }
+
+  // Neither left nor top are NULL.
+  a = vdupq_n_u16(left[-1]);
+  inner = vld1_u8_x2(top);
+
+  // Four rows per iteration: vld4_dup_u8() reads four consecutive left samples
+  // and broadcasts each one across its own 8-lane vector.
+  for (i = 0; i < 4; i++) {
+    const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);
+
+    TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
+    TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
+    TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
+    TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
+  }
+}
+
+// Computes the four 16x16 luma predictors (DC, vertical, horizontal and
+// TrueMotion) and writes each one at its own offset inside 'dst': I16DC16,
+// I16VE16, I16HE16 and I16TM16 respectively. 'left' and/or 'top' may be NULL
+// when the corresponding border is unavailable; each helper substitutes its
+// own default in that case.
+static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
+                              const uint8_t* WEBP_RESTRICT left,
+                              const uint8_t* WEBP_RESTRICT top) {
+  DCMode_NEON(I16DC16 + dst, left, top);
+  VerticalPred16_NEON(I16VE16 + dst, top);
+  HorizontalPred16_NEON(I16HE16 + dst, left);
+  TrueMotion_NEON(I16TM16 + dst, left, top);
+}
+
+#endif // WEBP_AARCH64
+
 //------------------------------------------------------------------------------
 // Entry point

@ -930,9 +1222,17 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE4x4 = SSE4x4_NEON;

+#if WEBP_AARCH64
+#if BPS == 32
+  VP8EncPredLuma4 = Intra4Preds_NEON;
+#endif
+  VP8EncPredLuma16 = Intra16Preds_NEON;
+#endif
+
 #if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock_NEON;
  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
 #endif
 }

--- a/Show More
+++ b/Show More