[M108-LTS] Fix OOB write in BuildHuffmanTable.

M108 merge issues: dec/vp8l_dec.c: - Conflicting checks before ReadHuffmanCodeLengths() return statement; In 114, an assignment follows the check instead of a return. - ReadHuffmanCodes(): Conflict after the changed huffman_tables check, there's an assignment in 114 instead of the setter call. First, BuildHuffmanTable is called to check if the data is valid. If it is and the table is not big enough, more memory is allocated. This will make sure that valid (but unoptimized because of unbalanced codes) streams are still decodable. Bug: chromium:1479274 Change-Id: I31c36dbf3aa78d35ecf38706b50464fd3d375741 (cherry picked from commit 902bc91903)
Merge "lossless: fix crunch mode w/WEBP_REDUCE_SIZE" into main
2025-07-15 13:29:54 +02:00 · 2023-09-12 09:30:11 +00:00 · 2022-07-21 16:27:38 +00:00 · 2022-07-20 22:04:34 -07:00 · 2022-07-20 10:36:23 -07:00 · 2022-07-15 16:00:00 -07:00
112 changed files with 4747 additions and 3159 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,7 +3,9 @@
 *.pc
 .DS_Store
 .deps
+.idea
 .libs
+.vscode
 /aclocal.m4
 /ar-lib
 /autom4te.cache
--- a/.mailmap
+++ b/.mailmap
@ -8,8 +8,11 @@ Vikas Arora <vikasa@google.com>
 <vikasa@google.com> <vikaas.arora@gmail.com>
 <slobodan.prijic@imgtec.com> <Slobodan.Prijic@imgtec.com>
 <vrabaud@google.com> <vincent.rabaud@gmail.com>
+Vincent Rabaud <vrabaud@google.com>
 Tamar Levy <tamar.levy@intel.com>
 <qrczak@google.com> <qrczak>
 Hui Su <huisu@google.com>
 James Zern <jzern@google.com>
 Roberto Alanis <alanisbaez@google.com>
+Brian Ledger <brianpl@google.com>
+Maryla Ustarroz-Calonge <maryla@google.com>
--- a/4
+++ b/4
@ -1,12 +1,15 @@
 Contributors:
 - Aidan O'Loan (aidanol at gmail dot com)
 - Alan Browning (browning at google dot com)
+- Alexandru Ardelean (ardeleanalex at gmail dot com)
+- Brian Ledger (brianpl at google dot com)
 - Charles Munger (clm at google dot com)
 - Cheng Yi (cyi at google dot com)
 - Christian Duvivier (cduvivier at google dot com)
 - Christopher Degawa (ccom at randomderp dot com)
 - Clement Courbet (courbet at google dot com)
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
+- Frank Barchard (fbarchard at google dot com)
 - Hui Su (huisu at google dot com)
 - Ilya Kurdyukov (jpegqs at gmail dot com)
 - Ingvar Stepanyan (rreverser at google dot com)
@ -22,6 +25,7 @@ Contributors:
 - Mans Rullgard (mans at mansr dot com)
 - Marcin Kowalczyk (qrczak at google dot com)
 - Martin Olsson (mnemo at minimum dot se)
+- Maryla Ustarroz-Calonge (maryla at google dot com)
 - Mikołaj Zalewski (mikolajz at google dot com)
 - Mislav Bradac (mislavm at google dot com)
 - Nico Weber (thakis at chromium dot org)
--- a/Android.mk
+++ b/Android.mk
@ -33,6 +33,14 @@ else
  NEON := c
 endif

+sharpyuv_srcs := \
+    sharpyuv/sharpyuv.c \
+    sharpyuv/sharpyuv_csp.c \
+    sharpyuv/sharpyuv_dsp.c \
+    sharpyuv/sharpyuv_gamma.c \
+    sharpyuv/sharpyuv_neon.$(NEON) \
+    sharpyuv/sharpyuv_sse2.c \
+
 dec_srcs := \
    src/dec/alpha_dec.c \
    src/dec/buffer_dec.c \
@ -204,6 +212,7 @@ endif  # ENABLE_SHARED=1
 include $(CLEAR_VARS)

 LOCAL_SRC_FILES := \
+    $(sharpyuv_srcs) \
    $(dsp_enc_srcs) \
    $(enc_srcs) \
    $(utils_enc_srcs) \
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -15,6 +15,14 @@ endif()
 project(WebP C)

 # Options for coder / decoder executables.
+if(BUILD_SHARED_LIBS)
+  set(WEBP_LINK_STATIC_DEFAULT OFF)
+else()
+  set(WEBP_LINK_STATIC_DEFAULT ON)
+endif()
+option(WEBP_LINK_STATIC
+       "Link using static libraries. If OFF, use dynamic libraries."
+       ${WEBP_LINK_STATIC_DEFAULT})
 if(NOT EMSCRIPTEN)
  # Disable SIMD on Emscripten by default, as it's a new unstable Wasm feature.
  # Users can still explicitly opt-in to make a SIMD-enabled build.
@ -40,6 +48,18 @@ option(WEBP_ENABLE_SWAP_16BIT_CSP "Enable byte swap for 16 bit colorspaces."
 set(WEBP_BITTRACE "0" CACHE STRING "Bit trace mode (0=none, 1=bit, 2=bytes)")
 set_property(CACHE WEBP_BITTRACE PROPERTY STRINGS 0 1 2)

+if(WEBP_LINK_STATIC)
+  if(WIN32)
+    SET(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  else()
+    SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  endif()
+  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+  # vwebp does not compile on Ubuntu with static libraries so disabling it for
+  # now.
+  set(WEBP_BUILD_VWEBP OFF)
+endif()
+
 # Option needed for handling Unicode file names on Windows.
 if(WIN32)
  option(WEBP_UNICODE "Build Unicode executables." ON)
@ -97,6 +117,8 @@ set(includedir "\$\{prefix\}/include")
 set(PTHREAD_LIBS ${CMAKE_THREAD_LIBS_INIT})
 set(INSTALLED_LIBRARIES)

+set(CMAKE_C_VISIBILITY_PRESET hidden)
+
 # ##############################################################################
 # Android only.
 if(ANDROID)
@ -211,6 +233,18 @@ function(libwebp_add_stub_file TARGET)
  target_sources(${TARGET} PRIVATE ${stub_source_file})
 endfunction()

+parse_makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/sharpyuv "WEBP_SHARPYUV_SRCS"
+                  "")
+add_library(sharpyuv OBJECT ${WEBP_SHARPYUV_SRCS})
+target_include_directories(sharpyuv
+                            PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+                            ${CMAKE_CURRENT_SOURCE_DIR})
+set_target_properties(
+  sharpyuv
+  PROPERTIES PUBLIC_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/sharpyuv/sharpyuv.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/sharpyuv/sharpyuv_csp.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h")
+
 if(MSVC)
  # avoid security warnings for e.g., fopen() used in the examples.
  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
@ -275,6 +309,7 @@ target_include_directories(webputils
                           PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
                                   ${CMAKE_CURRENT_SOURCE_DIR})
 add_library(webp
+            $<TARGET_OBJECTS:sharpyuv>
            $<TARGET_OBJECTS:webpdecode>
            $<TARGET_OBJECTS:webpdsp>
            $<TARGET_OBJECTS:webpencode>
@ -297,7 +332,8 @@ ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h")

 # Make sure the OBJECT libraries are built with position independent code (it is
 # not ON by default).
-set_target_properties(webpdecode
+set_target_properties(sharpyuv
+                      webpdecode
                      webpdspdecode
                      webputilsdecode
                      webpencode
@ -411,13 +447,14 @@ if(WEBP_BUILD_ANIM_UTILS
   OR WEBP_BUILD_DWEBP
   OR WEBP_BUILD_GIF2WEBP
   OR WEBP_BUILD_IMG2WEBP
-   OR WEBP_BUILD_VWEBP)
+   OR WEBP_BUILD_VWEBP
+   OR WEBP_BUILD_WEBPMUX
+   OR WEBP_BUILD_WEBPINFO)
  # Example utility library.
  parse_makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "EXAMPLEUTIL_SRCS"
                    "example_util_[^ ]*")
  list(APPEND EXAMPLEUTIL_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
  add_library(exampleutil STATIC ${EXAMPLEUTIL_SRCS})
-  target_link_libraries(exampleutil imageioutil)
  target_include_directories(
    exampleutil
    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>)
@ -426,6 +463,7 @@ if(WEBP_BUILD_ANIM_UTILS
                    "imageio_util_[^ ]*")
  add_library(imageioutil STATIC ${IMAGEIOUTILS_SRCS})
  target_link_libraries(imageioutil webp)
+  target_link_libraries(exampleutil imageioutil)

  # Image-decoding utility library.
  parse_makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEDEC_SRCS"
@ -471,19 +509,18 @@ endif()

 if(WEBP_BUILD_LIBWEBPMUX)
  parse_makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/mux "WEBP_MUX_SRCS" "")
-  add_library(libwebpmux ${WEBP_MUX_SRCS})
-  target_link_libraries(libwebpmux webp)
-  target_include_directories(libwebpmux
+  add_library(webpmux ${WEBP_MUX_SRCS})
+  target_link_libraries(webpmux webp)
+  target_include_directories(webpmux
                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
                                     ${CMAKE_CURRENT_SOURCE_DIR})
-  set_version(mux/Makefile.am libwebpmux webpmux)
-  set_target_properties(libwebpmux
+  set_version(mux/Makefile.am webpmux webpmux)
+  set_target_properties(webpmux
                        PROPERTIES PUBLIC_HEADER
                                   "${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux.h;\
 ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux_types.h;\
 ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h;")
-  set_target_properties(libwebpmux PROPERTIES OUTPUT_NAME webpmux)
-  list(APPEND INSTALLED_LIBRARIES libwebpmux)
+  list(APPEND INSTALLED_LIBRARIES webpmux)
  configure_pkg_config("src/mux/libwebpmux.pc")
 endif()

@ -497,7 +534,7 @@ if(WEBP_BUILD_GIF2WEBP)
                        exampleutil
                        imageioutil
                        webp
-                        libwebpmux
+                        webpmux
                        ${WEBP_DEP_GIF_LIBRARIES})
  target_include_directories(gif2webp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
  install(TARGETS gif2webp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
@ -514,7 +551,7 @@ if(WEBP_BUILD_IMG2WEBP)
                        imagedec
                        imageioutil
                        webp
-                        libwebpmux)
+                        webpmux)
  target_include_directories(img2webp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
  install(TARGETS img2webp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()
@ -554,7 +591,9 @@ if(WEBP_BUILD_WEBPINFO)
                    "webpinfo")
  add_executable(webpinfo ${WEBPINFO_SRCS})
  target_link_libraries(webpinfo exampleutil imageioutil)
-  target_include_directories(webpinfo PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
+  target_include_directories(webpinfo
+                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src
+                                     ${CMAKE_CURRENT_SOURCE_DIR}/src)
  install(TARGETS webpinfo RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()

@ -562,10 +601,12 @@ if(WEBP_BUILD_WEBPMUX)
  # webpmux
  parse_makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "WEBPMUX_SRCS"
                    "webpmux")
-  add_executable(webpmux ${WEBPMUX_SRCS})
-  target_link_libraries(webpmux exampleutil imageioutil libwebpmux webp)
-  target_include_directories(webpmux PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
-  install(TARGETS webpmux RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+  add_executable(webpmux_app ${WEBPMUX_SRCS})
+  set_target_properties(webpmux_app PROPERTIES OUTPUT_NAME webpmux)
+  target_link_libraries(webpmux_app exampleutil imageioutil webpmux webp)
+  target_include_directories(webpmux_app
+                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
+  install(TARGETS webpmux_app RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()

 if(WEBP_BUILD_EXTRAS)
@ -575,6 +616,13 @@ if(WEBP_BUILD_EXTRAS)
  parse_makefile_am(${EXTRAS_MAKEFILE} "WEBP_QUALITY_SRCS" "webp_quality")
  parse_makefile_am(${EXTRAS_MAKEFILE} "VWEBP_SDL_SRCS" "vwebp_sdl")

+  # libextras
+  add_library(extras STATIC ${WEBP_EXTRAS_SRCS})
+  target_include_directories(extras
+                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+                             ${CMAKE_CURRENT_SOURCE_DIR}
+                             ${CMAKE_CURRENT_SOURCE_DIR}/src)
+
  # get_disto
  add_executable(get_disto ${GET_DISTO_SRCS})
  target_link_libraries(get_disto imagedec)
@ -583,15 +631,15 @@ if(WEBP_BUILD_EXTRAS)
                                     ${CMAKE_CURRENT_BINARY_DIR}/src)

  # webp_quality
-  add_executable(webp_quality ${WEBP_QUALITY_SRCS} ${WEBP_EXTRAS_SRCS})
-  target_link_libraries(webp_quality exampleutil imagedec)
+  add_executable(webp_quality ${WEBP_QUALITY_SRCS})
+  target_link_libraries(webp_quality exampleutil imagedec extras)
  target_include_directories(webp_quality
                             PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
                                     ${CMAKE_CURRENT_BINARY_DIR})

  # vwebp_sdl
  find_package(SDL)
-  if(SDL_FOUND)
+  if(WEBP_BUILD_VWEBP AND SDL_FOUND)
    add_executable(vwebp_sdl ${VWEBP_SDL_SRCS})
    target_link_libraries(vwebp_sdl ${SDL_LIBRARY} imageioutil webp)
    target_include_directories(vwebp_sdl
--- a/132
+++ b/132
@ -1,3 +1,131 @@
+56a480e8 dsp/cpu.h: add missing extern "C"
+62b45bdd update ChangeLog (tag: v1.2.3-rc1)
+8764ec7a Merge changes Idb037953,Id582e395 into 1.2.3
+bcb872c3 vwebp: fix file name display in windows unicode build
+67c44ac5 webpmux: fix -frame option in windows unicode build
+8278825a makefile.unix: add sharpyuv objects to clean target
+14a49e01 update NEWS
+34b1dc33 bump version to 1.2.3
+0b397fda update AUTHORS
+c16488ac update .mailmap
+5a2d929c Merge "unicode.h: set console mode before using wprintf" into main
+169f867f unicode.h: set console mode before using wprintf
+a94b855c Merge "libsharpyuv: add version defines" into main
+f83bdb52 libsharpyuv: add version defines
+bef0d797 unicode_gif.h: fix -Wdeclaration-after-statement
+404c1622 Rename Huffman coding to prefix coding in the bitstream spec
+8895f8a3 Merge "run_static_analysis.sh: fix scan-build archive path" into main
+92a673d2 Merge "Add -fvisibility=hidden flag in CMakeLists." into main
+67c1d722 Merge "add WEBP_MSAN" into main
+1124ff66 Add -fvisibility=hidden flag in CMakeLists.
+e15b3560 add WEBP_MSAN
+ec9e782a sharpyuv: remove minimum image size from sharpyuv library
+7bd07f3b run_static_analysis.sh: fix scan-build archive path
+5ecee06f Merge "sharpyuv: increase precision of gamma<->linear conversion" into main
+f81dd7d6 Merge changes I3d17d529,I53026880,I1bd61639,I6bd4b25d,Icfec8fba into main
+2d607ee6 sharpyuv: increase precision of gamma<->linear conversion
+266cbbc5 sharpyuv: add 32bit version of SharpYuvFilterRow.
+9fc12274 CMake: add src to webpinfo includes
+7d18f40a CMake: add WEBP_BUILD_WEBPINFO to list of checks for exampleutil
+11309aa5 CMake: add WEBP_BUILD_WEBPMUX to list of checks for exampleutil
+4bc762f7 CMake: link imageioutil to exampleutil after defined
+0d1b9bc4 WEBP_DEP_LIBRARIES: use Threads::Threads
+20ef48f0 Merge "sharpyuv: add support for 10/12/16 bit rgb and 10/12 bit yuv." into main
+93c54371 sharpyuv: add support for 10/12/16 bit rgb and 10/12 bit yuv.
+53cf2b49 normalize WebPValidatePicture declaration w/definition
+d3006f4b sharpyuv: slightly improve precision
+ea967098 Merge changes Ia01bd397,Ibf3771af into main
+11bc8410 Merge changes I2d317c4b,I9e77f6db into main
+30453ea4 Add an internal WebPValidatePicture.
+6c43219a Some renamings for consistency.
+4f59fa73 update .mailmap
+e74f8a62 webp-lossless-bitstream-spec,cosmetics: normalize range syntax
+5a709ec0 webp-lossless-bitstream-spec,cosmetics: fix code typo
+a2093acc webp-lossless-bitstream-spec: add amendment note
+86c66930 webp-lossless-bitstream-spec: fix BNF
+232f22da webp-lossless-bitstream-spec: fix 'simple code' snippet
+44dd765d webp-lossless-bitstream-spec: fix ColorTransform impl
+7a7e33e9 webp-lossless-bitstream-spec: fix TR-pixel right border note
+86f94ee0 Update lossless spec with Huffman codes.
+a3927cc8 sharpyuv.c,cosmetics: fix indent
+6c45cef7 Make sure the stride has a minimum value in the importer.
+0c8b0e67 sharpyuv: cleanup/cosmetic changes
+dc3841e0 {histogram,predictor}_enc: quiet int -> float warnings
+a19a25bb Replace doubles by floats in lossless misc cost estimations.
+42888f6c Add an option to enable static builds.
+7efcf3cc Merge "Fix typo in color constants: Marix -> Matrix" into main
+8f4b5c62 Fix typo in color constants: Marix -> Matrix
+90084d84 Merge "demux,IsValidExtendedFormat: remove unused variable" into main
+ed643f61 Merge changes I452d2485,Ic6d75475 into main
+8fa053d1 Rename SharpYUV to SharpYuv for consistency.
+99a87562 SharpYuvComputeConversionMatrix: quiet int->float warnings
+deb426be Makefile.vc: add sharpyuv_csp.obj to SHARPYUV_OBJS
+779597d4 demux,IsValidExtendedFormat: remove unused variable
+40e8aa57 Merge "libsharpyuv: add colorspace utilities" into main
+01a05de1 libsharpyuv: add colorspace utilities
+2de4b05a Merge changes Id9890a60,I376d81e6,I1c958838 into main
+b8bca81f Merge "configure.ac: use LT_INIT if available" into main
+e8e77b9c Merge changes I479bc487,I39864691,I5d486c2c,I186d13be into main
+7e7d5d50 Merge ".gitignore: add Android Studio & VS code dirs" into main
+10c50848 normalize label indent
+89f774e6 mux{edit,internal}: fix leaks on error
+2d3293ad ExUtilInitCommandLineArguments: fix leak on error
+ec34fd70 anim_util: fix leaks on error
+e4717287 gif2webp: fix segfault on OOM
+e3cfafaf GetBackwardReferences: fail on alloc error
+a828a59b BackwardReferencesHashChainDistanceOnly: fix segfault on OOM
+fe153fae VP8LEncodeStream: fix segfault on OOM
+919acc0e .gitignore: add Android Studio & VS code dirs
+efa0731b configure.ac: use LT_INIT if available
+0957fd69 tiffdec: add grayscale support
+e685feef Merge "Make libsharpyuv self-contained by removing dependency on cpu.c" into main
+841960b6 Make libsharpyuv self-contained by removing dependency on cpu.c
+617cf036 image_dec: add WebPGetEnabledInputFileFormats()
+7a68afaa Let SharpArgbToYuv caller pass in an RGB>YUV conversion matrix.
+34bb332c man/cwebp.1: add note about crop/resize order
+f0e9351c webp-lossless-bitstream-spec,cosmetics: fix some typos
+5ccbd6ed vp8l_dec.c,cosmetics: fix a few typos
+c3d0c2d7 fix ios build scripts after sharpyuv dep added
+d0d2292e Merge "Make libwebp depend on libsharpyuv." into main
+03d12190 alpha_processing_neon.c: fix 0x01... typo
+d55d447c Make libwebp depend on libsharpyuv.
+e4cbcdd2 Fix lossless encoding for MIPS.
+924e7ca6 alpha_processing_neon.c: fix Dispatch/ExtractAlpha_NEON
+0fa0ea54 Makefile.vc: use /MANIFEST:EMBED
+29cc95ce Basic version of libsharpyuv in libwebp, in C.
+a30f2190 examples/webpmux.c: fix a couple of typos
+66b3ce23 Fix bad overflow check in ReadTIFF()
+54e61a38 Markdownify libwebp docs and reorganize them.
+b4533deb CMakeLists.txt,cosmetics: break long line
+b9d2f9cd quant_enc.c: use WEBP_RESTRICT qualifier
+ec178f2c Add progress hook granularity in lossless
+26139c73 Rename MAX_COST to MAX_BIT_COST in histogram_enc.c
+13b82816 cmake: fix webpmux lib name for cmake linking
+88b6a396 webp-container-spec.txt,cosmetics: normalize formatting
+6f496540 Merge tag 'v1.2.2'
+4074acf8 dsp.h: bump msvc arm64 version requirement to 16.6
+b0a86089 update ChangeLog (tag: v1.2.2)
+6db8248c libwebp: Fix VP8EncTokenLoop() progress
+827a307f BMP enc: fix the transparency case
+db25f1b4 libwebp: Fix VP8EncTokenLoop() progress
+286e7fce libwebp: do not destroy jpeg codec twice on error
+6e8a4126 libwebp: do not destroy jpeg codec twice on error
+faf21968 Merge "BMP enc: fix the transparency case" into main
+480cd51d BMP enc: fix the transparency case
+9195ea05 update ChangeLog (tag: v1.2.2-rc2)
+4acae017 update NEWS
+883f0633 man/img2webp.1: update date
+567e1f44 Reword img2webp synopsis command line
+1b0c15db man/img2webp.1: update date
+17bade38 Merge "Reword img2webp synopsis command line" into main
+a80954a1 Reword img2webp synopsis command line
+f084244d anim_decode: fix alpha blending with big-endian
+b217b4ff webpinfo: fix fourcc comparison w/big-endian
+ec497b75 Merge "anim_decode: fix alpha blending with big-endian" into main
+e4886716 anim_decode: fix alpha blending with big-endian
+e3cb052c webpinfo: fix fourcc comparison w/big-endian
+a510fedb patch-check: detect duplicated files
+f035d2e4 update ChangeLog (tag: v1.2.2-rc1)
 7031946a update NEWS
 973390b6 bump version to 1.2.2
 abd6664f update AUTHORS
@ -35,7 +163,7 @@ e23cd548 dsp.h: enable NEON w/VS2019+ ARM64 targets
 42592af8 webp,cmake: Remove unnecessary include dirs
 e298e05f Add patch-check steps in PRESUBMIT.py
 29148919 Merge tag 'v1.2.1'
-9ce5843d update ChangeLog (tag: v1.2.1, origin/1.2.1)
+9ce5843d update ChangeLog (tag: v1.2.1)
 d9191588 fuzzer/*: normalize src/ includes
 c5bc3624 fuzzer/*: normalize src/ includes
 53b6f762 fix indent
@ -214,7 +342,7 @@ a99078c1 remove call to MBAnalyzeBestIntra4Mode for method >= 5
 6a0ff358 Enc: add a qmin / qmax range for quality factor
 0fa56f30 Merge tag 'v1.1.0'
 6cf504d0 PNM decoding: handle max_value != 255
-d7844e97 update ChangeLog (tag: v1.1.0-rc2, tag: v1.1.0, origin/1.1.0)
+d7844e97 update ChangeLog (tag: v1.1.0-rc2, tag: v1.1.0)
 7f006436 Makefile.vc: fix webp_quality.exe link
 cf047e83 Makefile.vc: fix webp_quality.exe link
 c074c653 update NEWS
--- a/Makefile.am
+++ b/Makefile.am
@ -1,5 +1,5 @@
 ACLOCAL_AMFLAGS = -I m4
-SUBDIRS = src imageio man
+SUBDIRS = sharpyuv src imageio man
 EXTRA_DIST = COPYING autogen.sh

 if BUILD_EXTRAS
--- a/Makefile.vc
+++ b/Makefile.vc
@ -31,12 +31,11 @@ CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
 CCDEBUG    = cl.exe $(NOLOGO) /Od /Zi /D_DEBUG /RTC1
 CFLAGS     = /I. /Isrc $(NOLOGO) /W3 /EHsc /c
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
-LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
+LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST:EMBED /NXCOMPAT /DYNAMICBASE
 LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
 LNKDLL     = link.exe /DLL $(NOLOGO)
 LNKEXE     = link.exe $(NOLOGO)
 LNKLIB     = lib.exe $(NOLOGO)
-MT         = mt.exe $(NOLOGO)
 RCNODBG    = rc.exe $(NOLOGO) /l"0x0409"  # 0x409 = U.S. English
 RCDEBUG    = $(RCNODBG) /D_DEBUG

@ -82,6 +81,7 @@ OUTPUT_DIRS = $(DIRBIN) $(DIRINC) $(DIRLIB) \
              $(DIROBJ)\extras \
              $(DIROBJ)\imageio \
              $(DIROBJ)\mux \
+              $(DIROBJ)\sharpyuv \
              $(DIROBJ)\utils \

 # Target configuration
@ -174,6 +174,14 @@ CFLAGS = $(CFLAGS) /D_UNICODE /DUNICODE
 # A config was provided, so the library can be built.
 #

+SHARPYUV_OBJS = \
+    $(DIROBJ)\sharpyuv\sharpyuv.obj \
+    $(DIROBJ)\sharpyuv\sharpyuv_csp.obj \
+    $(DIROBJ)\sharpyuv\sharpyuv_dsp.obj \
+    $(DIROBJ)\sharpyuv\sharpyuv_gamma.obj \
+    $(DIROBJ)\sharpyuv\sharpyuv_neon.obj \
+    $(DIROBJ)\sharpyuv\sharpyuv_sse2.obj \
+
 DEC_OBJS = \
    $(DIROBJ)\dec\alpha_dec.obj \
    $(DIROBJ)\dec\buffer_dec.obj \
@ -335,8 +343,8 @@ UTILS_ENC_OBJS = \
    $(DIROBJ)\utils\quant_levels_utils.obj \

 LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
-LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
-               $(UTILS_ENC_OBJS) $(DLL_OBJS)
+LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(SHARPYUV_OBJS) $(ENC_OBJS) \
+               $(DSP_ENC_OBJS) $(UTILS_ENC_OBJS) $(DLL_OBJS)
 LIBWEBPMUX_OBJS = $(MUX_OBJS) $(LIBWEBPMUX_OBJS)
 LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS) $(LIBWEBPDEMUX_OBJS)

@ -482,6 +490,8 @@ $(DIROBJ)\examples\gifdec.obj: examples\gifdec.c
 	$(CC) $(CFLAGS) /Fd$(DIROBJ)\extras\ /Fo$(DIROBJ)\extras\ $<
 {imageio}.c{$(DIROBJ)\imageio}.obj::
 	$(CC) $(CFLAGS) /Fd$(DIROBJ)\imageio\ /Fo$(DIROBJ)\imageio\ $<
+{sharpyuv}.c{$(DIROBJ)\sharpyuv}.obj::
+	$(CC) $(CFLAGS) /Fd$(DIROBJ)\sharpyuv\ /Fo$(DIROBJ)\sharpyuv\ $<
 {src\dec}.c{$(DIROBJ)\dec}.obj::
 	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dec\ $<
 {src\demux}.c{$(DIROBJ)\demux}.obj::
@ -502,13 +512,9 @@ LNKLIBS     = $(LNKLIBS) Shell32.lib

 {$(DIROBJ)\examples}.obj{$(DIRBIN)}.exe:
 	$(LNKEXE) $(LDFLAGS) /OUT:$@ $** $(LNKLIBS)
-	$(MT) -manifest $@.manifest -outputresource:$@;1
-	del $@.manifest

 {$(DIROBJ)\extras}.obj{$(DIRBIN)}.exe:
 	$(LNKEXE) $(LDFLAGS) /OUT:$@ $** $(LNKLIBS)
-	$(MT) -manifest $@.manifest -outputresource:$@;1
-	del $@.manifest

 clean::
 	@-erase /s $(DIROBJ)\*.dll 2> NUL
--- a/12
+++ b/12
@ -1,10 +1,18 @@
- 12/15/2021: version 1.2.2
+- 6/30/2022: version 1.2.3
+  This is a binary compatible release.
+  * security fix for lossless encoder (#565, chromium:1313709)
+  * improved progress granularity in WebPReportProgress() when using lossless
+  * improved precision in Sharp YUV (-sharp_yuv) conversion
+  * many corrections to webp-lossless-bitstream-spec.txt (#551)
+  * crash/leak fixes on error/OOM and other bug fixes (#558, #563, #569, #573)
+
+- 1/11/2022: version 1.2.2
  This is a binary compatible release.
  * webpmux: add "-set bgcolor A,R,G,B"
  * add ARM64 NEON support for MSVC builds (#539)
  * fix duplicate include error in Xcode when using multiple XCFrameworks in a
    project (#542)
-  * doc updates and bug fixes (#538, #544)
+  * doc updates and bug fixes (#538, #544, #548, #550)

 - 7/20/2021: version 1.2.1
  This is a binary compatible release.
--- a/PRESUBMIT.py
+++ b/PRESUBMIT.py
@ -41,6 +41,7 @@ _BASH_INDENTATION = "2"
 _GIT_COMMIT_SUBJECT_LENGTH = 65
 _INCLUDE_BASH_FILES_ONLY = [r".*\.sh$"]
 _INCLUDE_MAN_FILES_ONLY = [r"man/.+\.1$"]
+_INCLUDE_SOURCE_FILES_ONLY = [r".*\.[ch]$"]
 _LIBWEBP_MAX_LINE_LENGTH = 80


@ -72,6 +73,35 @@ def _CheckCommitSubjectLength(input_api, output_api):
  return output_api.PresubmitResult("%s\n (%4.2fs) success" % (name, duration))


+def _CheckDuplicateFiles(input_api, output_api):
+  """Ensures there are not repeated filenames."""
+  all_files = []
+  for f in input_api.change.AllFiles():
+    for include_file in _INCLUDE_SOURCE_FILES_ONLY:
+      if re.match(include_file, f):
+        all_files.append(f)
+        break
+
+  basename_to_path = {}
+  for f in all_files:
+    basename_file = input_api.basename(f)
+    if basename_file in basename_to_path:
+      basename_to_path[basename_file].append(f)
+    else:
+      basename_to_path[basename_file] = [f]
+
+  dupes = []
+  for files in basename_to_path.values():
+    if len(files) > 1:
+      dupes.extend(files)
+
+  if dupes:
+    return output_api.PresubmitError(
+        "Duplicate source files, rebase or rename some to make them unique:\n%s"
+        % dupes)
+  return output_api.PresubmitResult("No duplicates, success\n")
+
+
 def _GetFilesToSkip(input_api):
  return list(input_api.DEFAULT_FILES_TO_SKIP) + [
      r"swig/.*\.py$",
@ -154,6 +184,7 @@ def _CommonChecks(input_api, output_api):
      input_api.canned_checks.CheckChangeHasNoStrayWhitespace(
          input_api, output_api))
  results.append(_CheckCommitSubjectLength(input_api, output_api))
+  results.append(_CheckDuplicateFiles(input_api, output_api))

  source_file_filter = lambda x: input_api.FilterSourceFile(
      x, files_to_skip=_GetFilesToSkip(input_api))
--- a/795
+++ b/795
@ -1,795 +0,0 @@
-          __   __  ____  ____  ____
-         /  \\/  \/  _ \/  _ )/  _ \
-         \       /   __/  _  \   __/
-          \__\__/\____/\_____/__/ ____  ___
-                / _/ /    \    \ /  _ \/ _/
-               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v1.2.2
-
-Description:
-============
-
-WebP codec: library to encode and decode images in WebP format. This package
-contains the library that can be used in other programs to add WebP support,
-as well as the command line tools 'cwebp' and 'dwebp'.
-
-See https://developers.google.com/speed/webp
-
-The latest source tree is available at
-https://chromium.googlesource.com/webm/libwebp
-
-It is released under the same license as the WebM project.
-See https://www.webmproject.org/license/software/ or the
-"COPYING" file for details. An additional intellectual
-property rights grant can be found in the file PATENTS.
-
-Building:
-=========
-
-Windows build:
--------------
-
-By running:
-
-  nmake /f Makefile.vc CFG=release-static RTLIBCFG=static OBJDIR=output
-
-the directory output\release-static\(x64|x86)\bin will contain the tools
-cwebp.exe and dwebp.exe. The directory output\release-static\(x64|x86)\lib will
-contain the libwebp static library.
-The target architecture (x86/x64) is detected by Makefile.vc from the Visual
-Studio compiler (cl.exe) available in the system path.
-
-Unix build using makefile.unix:
-------------------------------
-
-On platforms with GNU tools installed (gcc and make), running
-
-  make -f makefile.unix
-
-will build the binaries examples/cwebp and examples/dwebp, along
-with the static library src/libwebp.a. No system-wide installation
-is supplied, as this is a simple alternative to the full installation
-system based on the autoconf tools (see below).
-Please refer to makefile.unix for additional details and customizations.
-
-Using autoconf tools:
---------------------
-Prerequisites:
-A compiler (e.g., gcc), make, autoconf, automake, libtool.
-On a Debian-like system the following should install everything you need for a
-minimal build:
-$ sudo apt-get install gcc make autoconf automake libtool
-
-When building from git sources, you will need to run autogen.sh to generate the
-configure script.
-
-./configure
-make
-make install
-
-should be all you need to have the following files
-
-/usr/local/include/webp/decode.h
-/usr/local/include/webp/encode.h
-/usr/local/include/webp/types.h
-/usr/local/lib/libwebp.*
-/usr/local/bin/cwebp
-/usr/local/bin/dwebp
-
-installed.
-
-Note: A decode-only library, libwebpdecoder, is available using the
-'--enable-libwebpdecoder' flag. The encode library is built separately and can
-be installed independently using a minor modification in the corresponding
-Makefile.am configure files (see comments there). See './configure --help' for
-more options.
-
-Building for MIPS Linux:
------------------------
-MIPS Linux toolchain stable available releases can be found at:
-https://community.imgtec.com/developers/mips/tools/codescape-mips-sdk/available-releases/
-
-# Add toolchain to PATH
-export PATH=$PATH:/path/to/toolchain/bin
-
-# 32-bit build for mips32r5 (p5600)
-HOST=mips-mti-linux-gnu
-MIPS_CFLAGS="-O3 -mips32r5 -mabi=32 -mtune=p5600 -mmsa -mfp64 \
-  -msched-weight -mload-store-pairs -fPIE"
-MIPS_LDFLAGS="-mips32r5 -mabi=32 -mmsa -mfp64 -pie"
-
-# 64-bit build for mips64r6 (i6400)
-HOST=mips-img-linux-gnu
-MIPS_CFLAGS="-O3 -mips64r6 -mabi=64 -mtune=i6400 -mmsa -mfp64 \
-  -msched-weight -mload-store-pairs -fPIE"
-MIPS_LDFLAGS="-mips64r6 -mabi=64 -mmsa -mfp64 -pie"
-
-./configure --host=${HOST} --build=`config.guess` \
-  CC="${HOST}-gcc -EL" \
-  CFLAGS="$MIPS_CFLAGS" \
-  LDFLAGS="$MIPS_LDFLAGS"
-make
-make install
-
-CMake:
------
-With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp, webpinfo
-and the JS bindings.
-
-Prerequisites:
-A compiler (e.g., gcc with autotools) and CMake.
-On a Debian-like system the following should install everything you need for a
-minimal build:
-$ sudo apt-get install build-essential cmake
-
-When building from git sources, you will need to run cmake to generate the
-makefiles.
-
-mkdir build && cd build && cmake ../
-make
-make install
-
-If you also want any of the executables, you will need to enable them through
-CMake, e.g.:
-
-cmake -DWEBP_BUILD_CWEBP=ON -DWEBP_BUILD_DWEBP=ON ../
-
-or through your favorite interface (like ccmake or cmake-qt-gui).
-
-Use option -DWEBP_UNICODE=ON for Unicode support on Windows (with chcp 65001).
-
-Finally, once installed, you can also use WebP in your CMake project by doing:
-
-find_package(WebP)
-
-which will define the CMake variables WebP_INCLUDE_DIRS and WebP_LIBRARIES.
-
-Gradle:
-------
-The support for Gradle is minimal: it only helps you compile libwebp, cwebp and
-dwebp and webpmux_example.
-
-Prerequisites:
-A compiler (e.g., gcc with autotools) and gradle.
-On a Debian-like system the following should install everything you need for a
-minimal build:
-$ sudo apt-get install build-essential gradle
-
-When building from git sources, you will need to run the Gradle wrapper with the
-appropriate target, e.g. :
-
-./gradlew buildAllExecutables
-
-SWIG bindings:
--------------
-
-To generate language bindings from swig/libwebp.swig at least swig-1.3
-(http://www.swig.org) is required.
-
-Currently the following functions are mapped:
-Decode:
-  WebPGetDecoderVersion
-  WebPGetInfo
-  WebPDecodeRGBA
-  WebPDecodeARGB
-  WebPDecodeBGRA
-  WebPDecodeBGR
-  WebPDecodeRGB
-
-Encode:
-  WebPGetEncoderVersion
-  WebPEncodeRGBA
-  WebPEncodeBGRA
-  WebPEncodeRGB
-  WebPEncodeBGR
-  WebPEncodeLosslessRGBA
-  WebPEncodeLosslessBGRA
-  WebPEncodeLosslessRGB
-  WebPEncodeLosslessBGR
-
-See swig/README for more detailed build instructions.
-
-Java bindings:
-
-To build the swig-generated JNI wrapper code at least JDK-1.5 (or equivalent)
-is necessary for enum support. The output is intended to be a shared object /
-DLL that can be loaded via System.loadLibrary("webp_jni").
-
-Python bindings:
-
-To build the swig-generated Python extension code at least Python 2.6 is
-required. Python < 2.6 may build with some minor changes to libwebp.swig or the
-generated code, but is untested.
-
-Encoding tool:
-==============
-
-The examples/ directory contains tools for encoding (cwebp) and
-decoding (dwebp) images.
-
-The easiest use should look like:
-  cwebp input.png -q 80 -o output.webp
-which will convert the input file to a WebP file using a quality factor of 80
-on a 0->100 scale (0 being the lowest quality, 100 being the best. Default
-value is 75).
-You might want to try the -lossless flag too, which will compress the source
-(in RGBA format) without any loss. The -q quality parameter will in this case
-control the amount of processing time spent trying to make the output file as
-small as possible.
-
-A longer list of options is available using the -longhelp command line flag:
-
-> cwebp -longhelp
-Usage:
- cwebp [-preset <...>] [options] in_file [-o out_file]
-
-If input size (-s) for an image is not specified, it is
-assumed to be a PNG, JPEG, TIFF or WebP file.
-Note: Animated PNG and WebP files are not supported.
-
-Options:
-  -h / -help ............. short help
-  -H / -longhelp ......... long help
-  -q <float> ............. quality factor (0:small..100:big), default=75
-  -alpha_q <int> ......... transparency-compression quality (0..100),
-                           default=100
-  -preset <string> ....... preset setting, one of:
-                            default, photo, picture,
-                            drawing, icon, text
-     -preset must come first, as it overwrites other parameters
-  -z <int> ............... activates lossless preset with given
-                           level in [0:fast, ..., 9:slowest]
-
-  -m <int> ............... compression method (0=fast, 6=slowest), default=4
-  -segments <int> ........ number of segments to use (1..4), default=4
-  -size <int> ............ target size (in bytes)
-  -psnr <float> .......... target PSNR (in dB. typically: 42)
-
-  -s <int> <int> ......... input size (width x height) for YUV
-  -sns <int> ............. spatial noise shaping (0:off, 100:max), default=50
-  -f <int> ............... filter strength (0=off..100), default=60
-  -sharpness <int> ....... filter sharpness (0:most .. 7:least sharp), default=0
-  -strong ................ use strong filter instead of simple (default)
-  -nostrong .............. use simple filter instead of strong
-  -sharp_yuv ............. use sharper (and slower) RGB->YUV conversion
-  -partition_limit <int> . limit quality to fit the 512k limit on
-                           the first partition (0=no degradation ... 100=full)
-  -pass <int> ............ analysis pass number (1..10)
-  -qrange <min> <max> .... specifies the permissible quality range
-                           (default: 0 100)
-  -crop <x> <y> <w> <h> .. crop picture with the given rectangle
-  -resize <w> <h> ........ resize picture (after any cropping)
-  -mt .................... use multi-threading if available
-  -low_memory ............ reduce memory usage (slower encoding)
-  -map <int> ............. print map of extra info
-  -print_psnr ............ prints averaged PSNR distortion
-  -print_ssim ............ prints averaged SSIM distortion
-  -print_lsim ............ prints local-similarity distortion
-  -d <file.pgm> .......... dump the compressed output (PGM file)
-  -alpha_method <int> .... transparency-compression method (0..1), default=1
-  -alpha_filter <string> . predictive filtering for alpha plane,
-                           one of: none, fast (default) or best
-  -exact ................. preserve RGB values in transparent area, default=off
-  -blend_alpha <hex> ..... blend colors against background color
-                           expressed as RGB values written in
-                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0
-                           green=0xe0 and blue=0xd0
-  -noalpha ............... discard any transparency information
-  -lossless .............. encode image losslessly, default=off
-  -near_lossless <int> ... use near-lossless image
-                           preprocessing (0..100=off), default=100
-  -hint <string> ......... specify image characteristics hint,
-                           one of: photo, picture or graph
-
-  -metadata <string> ..... comma separated list of metadata to
-                           copy from the input to the output if present.
-                           Valid values: all, none (default), exif, icc, xmp
-
-  -short ................. condense printed message
-  -quiet ................. don't print anything
-  -version ............... print version number and exit
-  -noasm ................. disable all assembly optimizations
-  -v ..................... verbose, e.g. print encoding/decoding times
-  -progress .............. report encoding progress
-
-Experimental Options:
-  -jpeg_like ............. roughly match expected JPEG size
-  -af .................... auto-adjust filter strength
-  -pre <int> ............. pre-processing filter
-
-
-The main options you might want to try in order to further tune the
-visual quality are:
- -preset
- -sns
- -f
- -m
-
-Namely:
-  * 'preset' will set up a default encoding configuration targeting a
-     particular type of input. It should appear first in the list of options,
-     so that subsequent options can take effect on top of this preset.
-     Default value is 'default'.
-  * 'sns' will progressively turn on (when going from 0 to 100) some additional
-     visual optimizations (like: segmentation map re-enforcement). This option
-     will balance the bit allocation differently. It tries to take bits from the
-     "easy" parts of the picture and use them in the "difficult" ones instead.
-     Usually, raising the sns value (at fixed -q value) leads to larger files,
-     but with better quality.
-     Typical value is around '75'.
-  * 'f' option directly links to the filtering strength used by the codec's
-     in-loop processing. The higher the value, the smoother the
-     highly-compressed area will look. This is particularly useful when aiming
-     at very small files. Typical values are around 20-30. Note that using the
-     option -strong/-nostrong will change the type of filtering. Use "-f 0" to
-     turn filtering off.
-  * 'm' controls the trade-off between encoding speed and quality. Default is 4.
-     You can try -m 5 or -m 6 to explore more (time-consuming) encoding
-     possibilities. A lower value will result in faster encoding at the expense
-     of quality.
-
-Decoding tool:
-==============
-
-There is a decoding sample in examples/dwebp.c which will take
-a .webp file and decode it to a PNG image file (amongst other formats).
-This is simply to demonstrate the use of the API. You can verify the
-file test.webp decodes to exactly the same as test_ref.ppm by using:
-
- cd examples
- ./dwebp test.webp -ppm -o test.ppm
- diff test.ppm test_ref.ppm
-
-The full list of options is available using -h:
-
-> dwebp -h
-Usage: dwebp in_file [options] [-o out_file]
-
-Decodes the WebP image file to PNG format [Default].
-Note: Animated WebP files are not supported.
-
-Use following options to convert into alternate image formats:
-  -pam ......... save the raw RGBA samples as a color PAM
-  -ppm ......... save the raw RGB samples as a color PPM
-  -bmp ......... save as uncompressed BMP format
-  -tiff ........ save as uncompressed TIFF format
-  -pgm ......... save the raw YUV samples as a grayscale PGM
-                 file with IMC4 layout
-  -yuv ......... save the raw YUV samples in flat layout
-
- Other options are:
-  -version ..... print version number and exit
-  -nofancy ..... don't use the fancy YUV420 upscaler
-  -nofilter .... disable in-loop filtering
-  -nodither .... disable dithering
-  -dither <d> .. dithering strength (in 0..100)
-  -alpha_dither  use alpha-plane dithering if needed
-  -mt .......... use multi-threading
-  -crop <x> <y> <w> <h> ... crop output with the given rectangle
-  -resize <w> <h> ......... scale the output (*after* any cropping)
-  -flip ........ flip the output vertically
-  -alpha ....... only save the alpha plane
-  -incremental . use incremental decoding (useful for tests)
-  -h ........... this help message
-  -v ........... verbose (e.g. print encoding/decoding times)
-  -quiet ....... quiet mode, don't print anything
-  -noasm ....... disable all assembly optimizations
-
-WebP file analysis tool:
-========================
-
-'webpinfo' can be used to print out the chunk level structure and bitstream
-header information of WebP files. It can also check if the files are of valid
-WebP format.
-
-Usage: webpinfo [options] in_files
-Note: there could be multiple input files;
-      options must come before input files.
-Options:
-  -version ........... Print version number and exit.
-  -quiet ............. Do not show chunk parsing information.
-  -diag .............. Show parsing error diagnosis.
-  -summary ........... Show chunk stats summary.
-  -bitstream_info .... Parse bitstream header.
-
-Visualization tool:
-===================
-
-There's a little self-serve visualization tool called 'vwebp' under the
-examples/ directory. It uses OpenGL to open a simple drawing window and show
-a decoded WebP file. It's not yet integrated in the automake build system, but
-you can try to manually compile it using the recommendations below.
-
-Usage: vwebp in_file [options]
-
-Decodes the WebP image file and visualize it using OpenGL
-Options are:
-  -version ..... print version number and exit
-  -noicc ....... don't use the icc profile if present
-  -nofancy ..... don't use the fancy YUV420 upscaler
-  -nofilter .... disable in-loop filtering
-  -dither <int>  dithering strength (0..100), default=50
-  -noalphadither disable alpha plane dithering
-  -usebgcolor .. display background color
-  -mt .......... use multi-threading
-  -info ........ print info
-  -h ........... this help message
-
-Keyboard shortcuts:
-  'c' ................ toggle use of color profile
-  'b' ................ toggle background color display
-  'i' ................ overlay file information
-  'd' ................ disable blending & disposal (debug)
-  'q' / 'Q' / ESC .... quit
-
-Building:
---------
-
-Prerequisites:
-1) OpenGL & OpenGL Utility Toolkit (GLUT)
-  Linux:
-    $ sudo apt-get install freeglut3-dev mesa-common-dev
-  Mac + Xcode:
-    - These libraries should be available in the OpenGL / GLUT frameworks.
-  Windows:
-    http://freeglut.sourceforge.net/index.php#download
-
-2) (Optional) qcms (Quick Color Management System)
-  i. Download qcms from Mozilla / Chromium:
-    https://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
-    https://source.chromium.org/chromium/chromium/src/+/main:third_party/qcms/;drc=d4a2f8e1ed461d8fc05ed88d1ae2dc94c9773825
-  ii. Build and archive the source files as libqcms.a / qcms.lib
-  iii. Update makefile.unix / Makefile.vc
-    a) Define WEBP_HAVE_QCMS
-    b) Update include / library paths to reference the qcms directory.
-
-Build using makefile.unix / Makefile.vc:
-$ make -f makefile.unix examples/vwebp
-> nmake /f Makefile.vc CFG=release-static \
-    ../obj/x64/release-static/bin/vwebp.exe
-
-Animation creation tool:
-========================
-The utility 'img2webp' can turn a sequence of input images (PNG, JPEG, ...)
-into an animated WebP file. It offers fine control over duration, encoding
-modes, etc.
-
-Usage:
-
-  img2webp [file-level options] [image files...] [per-frame options...]
-
-File-level options (only used at the start of compression):
- -min_size ............ minimize size
- -loop <int> .......... loop count (default: 0, = infinite loop)
- -kmax <int> .......... maximum number of frame between key-frames
-                        (0=only keyframes)
- -kmin <int> .......... minimum number of frame between key-frames
-                        (0=disable key-frames altogether)
- -mixed ............... use mixed lossy/lossless automatic mode
- -v ................... verbose mode
- -h ................... this help
- -version ............. print version number and exit
-
-Per-frame options (only used for subsequent images input):
- -d <int> ............. frame duration in ms (default: 100)
- -lossless  ........... use lossless mode (default)
- -lossy ... ........... use lossy mode
- -q <float> ........... quality
- -m <int> ............. method to use
-
-example: img2webp -loop 2 in0.png -lossy in1.jpg
-                  -d 80 in2.tiff -o out.webp
-
-Note: if a single file name is passed as the argument, the arguments will be
-tokenized from this file. The file name must not start with the character '-'.
-
-Animated GIF conversion:
-========================
-Animated GIF files can be converted to WebP files with animation using the
-gif2webp utility available under examples/. The files can then be viewed using
-vwebp.
-
-Usage:
- gif2webp [options] gif_file -o webp_file
-Options:
-  -h / -help ............. this help
-  -lossy ................. encode image using lossy compression
-  -mixed ................. for each frame in the image, pick lossy
-                           or lossless compression heuristically
-  -q <float> ............. quality factor (0:small..100:big)
-  -m <int> ............... compression method (0=fast, 6=slowest)
-  -min_size .............. minimize output size (default:off)
-                           lossless compression by default; can be
-                           combined with -q, -m, -lossy or -mixed
-                           options
-  -kmin <int> ............ min distance between key frames
-  -kmax <int> ............ max distance between key frames
-  -f <int> ............... filter strength (0=off..100)
-  -metadata <string> ..... comma separated list of metadata to
-                           copy from the input to the output if present
-                           Valid values: all, none, icc, xmp (default)
-  -loop_compatibility .... use compatibility mode for Chrome
-                           version prior to M62 (inclusive)
-  -mt .................... use multi-threading if available
-
-  -version ............... print version number and exit
-  -v ..................... verbose
-  -quiet ................. don't print anything
-
-Building:
---------
-With the libgif development files installed, gif2webp can be built using
-makefile.unix:
-$ make -f makefile.unix examples/gif2webp
-
-or using autoconf:
-$ ./configure --enable-everything
-$ make
-
-Comparison of animated images:
-==============================
-Test utility anim_diff under examples/ can be used to compare two animated
-images (each can be GIF or WebP).
-
-Usage: anim_diff <image1> <image2> [options]
-
-Options:
-  -dump_frames <folder> dump decoded frames in PAM format
-  -min_psnr <float> ... minimum per-frame PSNR
-  -raw_comparison ..... if this flag is not used, RGB is
-                        premultiplied before comparison
-  -max_diff <int> ..... maximum allowed difference per channel
-                        between corresponding pixels in subsequent
-                        frames
-  -h .................. this help
-  -version ............ print version number and exit
-
-Building:
---------
-With the libgif development files and a C++ compiler installed, anim_diff can
-be built using makefile.unix:
-$ make -f makefile.unix examples/anim_diff
-
-or using autoconf:
-$ ./configure --enable-everything
-$ make
-
-Encoding API:
-=============
-
-The main encoding functions are available in the header src/webp/encode.h
-The ready-to-use ones are:
-size_t WebPEncodeRGB(const uint8_t* rgb, int width, int height, int stride,
-                     float quality_factor, uint8_t** output);
-size_t WebPEncodeBGR(const uint8_t* bgr, int width, int height, int stride,
-                     float quality_factor, uint8_t** output);
-size_t WebPEncodeRGBA(const uint8_t* rgba, int width, int height, int stride,
-                      float quality_factor, uint8_t** output);
-size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride,
-                      float quality_factor, uint8_t** output);
-
-They will convert raw RGB samples to a WebP data. The only control supplied
-is the quality factor.
-
-There are some variants for using the lossless format:
-
-size_t WebPEncodeLosslessRGB(const uint8_t* rgb, int width, int height,
-                             int stride, uint8_t** output);
-size_t WebPEncodeLosslessBGR(const uint8_t* bgr, int width, int height,
-                             int stride, uint8_t** output);
-size_t WebPEncodeLosslessRGBA(const uint8_t* rgba, int width, int height,
-                              int stride, uint8_t** output);
-size_t WebPEncodeLosslessBGRA(const uint8_t* bgra, int width, int height,
-                              int stride, uint8_t** output);
-
-Of course in this case, no quality factor is needed since the compression
-occurs without loss of the input values, at the expense of larger output sizes.
-
-Advanced encoding API:
----------------------
-
-A more advanced API is based on the WebPConfig and WebPPicture structures.
-
-WebPConfig contains the encoding settings and is not tied to a particular
-picture.
-WebPPicture contains input data, on which some WebPConfig will be used for
-compression.
-The encoding flow looks like:
-
-------------------------------------- BEGIN PSEUDO EXAMPLE
-
-#include <webp/encode.h>
-
-  // Setup a config, starting form a preset and tuning some additional
-  // parameters
-  WebPConfig config;
-  if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor)) {
-    return 0;   // version error
-  }
-  // ... additional tuning
-  config.sns_strength = 90;
-  config.filter_sharpness = 6;
-  config_error = WebPValidateConfig(&config);  // not mandatory, but useful
-
-  // Setup the input data
-  WebPPicture pic;
-  if (!WebPPictureInit(&pic)) {
-    return 0;  // version error
-  }
-  pic.width = width;
-  pic.height = height;
-  // allocated picture of dimension width x height
-  if (!WebPPictureAlloc(&pic)) {
-    return 0;   // memory error
-  }
-  // at this point, 'pic' has been initialized as a container,
-  // and can receive the Y/U/V samples.
-  // Alternatively, one could use ready-made import functions like
-  // WebPPictureImportRGB(), which will take care of memory allocation.
-  // In any case, past this point, one will have to call
-  // WebPPictureFree(&pic) to reclaim memory.
-
-  // Set up a byte-output write method. WebPMemoryWriter, for instance.
-  WebPMemoryWriter wrt;
-  WebPMemoryWriterInit(&wrt);     // initialize 'wrt'
-
-  pic.writer = MyFileWriter;
-  pic.custom_ptr = my_opaque_structure_to_make_MyFileWriter_work;
-
-  // Compress!
-  int ok = WebPEncode(&config, &pic);   // ok = 0 => error occurred!
-  WebPPictureFree(&pic);  // must be called independently of the 'ok' result.
-
-  // output data should have been handled by the writer at that point.
-  // -> compressed data is the memory buffer described by wrt.mem / wrt.size
-
-  // deallocate the memory used by compressed data
-  WebPMemoryWriterClear(&wrt);
-
-------------------------------------- END PSEUDO EXAMPLE
-
-Decoding API:
-=============
-
-This is mainly just one function to call:
-
-#include "webp/decode.h"
-uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
-                       int* width, int* height);
-
-Please have a look at the file src/webp/decode.h for the details.
-There are variants for decoding in BGR/RGBA/ARGB/BGRA order, along with
-decoding to raw Y'CbCr samples. One can also decode the image directly into a
-pre-allocated buffer.
-
-To detect a WebP file and gather the picture's dimensions, the function:
-  int WebPGetInfo(const uint8_t* data, size_t data_size,
-                  int* width, int* height);
-is supplied. No decoding is involved when using it.
-
-Incremental decoding API:
-=========================
-
-In the case when data is being progressively transmitted, pictures can still
-be incrementally decoded using a slightly more complicated API. Decoder state
-is stored into an instance of the WebPIDecoder object. This object can be
-created with the purpose of decoding either RGB or Y'CbCr samples.
-For instance:
-
-  WebPDecBuffer buffer;
-  WebPInitDecBuffer(&buffer);
-  buffer.colorspace = MODE_BGR;
-  ...
-  WebPIDecoder* idec = WebPINewDecoder(&buffer);
-
-As data is made progressively available, this incremental-decoder object
-can be used to decode the picture further. There are two (mutually exclusive)
-ways to pass freshly arrived data:
-
-either by appending the fresh bytes:
-
-  WebPIAppend(idec, fresh_data, size_of_fresh_data);
-
-or by just mentioning the new size of the transmitted data:
-
-  WebPIUpdate(idec, buffer, size_of_transmitted_buffer);
-
-Note that 'buffer' can be modified between each call to WebPIUpdate, in
-particular when the buffer is resized to accommodate larger data.
-
-These functions will return the decoding status: either VP8_STATUS_SUSPENDED if
-decoding is not finished yet or VP8_STATUS_OK when decoding is done. Any other
-status is an error condition.
-
-The 'idec' object must always be released (even upon an error condition) by
-calling: WebPDelete(idec).
-
-To retrieve partially decoded picture samples, one must use the corresponding
-method: WebPIDecGetRGB or WebPIDecGetYUVA.
-It will return the last displayable pixel row.
-
-Lastly, note that decoding can also be performed into a pre-allocated pixel
-buffer. This buffer must be passed when creating a WebPIDecoder, calling
-WebPINewRGB() or WebPINewYUVA().
-
-Please have a look at the src/webp/decode.h header for further details.
-
-Advanced Decoding API:
-======================
-
-WebP decoding supports an advanced API which provides on-the-fly cropping and
-rescaling, something of great usefulness on memory-constrained environments like
-mobile phones. Basically, the memory usage will scale with the output's size,
-not the input's, when one only needs a quick preview or a zoomed in portion of
-an otherwise too-large picture. Some CPU can be saved too, incidentally.
-
-------------------------------------- BEGIN PSEUDO EXAMPLE
-     // A) Init a configuration object
-     WebPDecoderConfig config;
-     CHECK(WebPInitDecoderConfig(&config));
-
-     // B) optional: retrieve the bitstream's features.
-     CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
-
-     // C) Adjust 'config' options, if needed
-     config.options.no_fancy_upsampling = 1;
-     config.options.use_scaling = 1;
-     config.options.scaled_width = scaledWidth();
-     config.options.scaled_height = scaledHeight();
-     // etc.
-
-     // D) Specify 'config' output options for specifying output colorspace.
-     // Optionally the external image decode buffer can also be specified.
-     config.output.colorspace = MODE_BGRA;
-     // Optionally, the config.output can be pointed to an external buffer as
-     // well for decoding the image. This externally supplied memory buffer
-     // should be big enough to store the decoded picture.
-     config.output.u.RGBA.rgba = (uint8_t*) memory_buffer;
-     config.output.u.RGBA.stride = scanline_stride;
-     config.output.u.RGBA.size = total_size_of_the_memory_buffer;
-     config.output.is_external_memory = 1;
-
-     // E) Decode the WebP image. There are two variants w.r.t decoding image.
-     // The first one (E.1) decodes the full image and the second one (E.2) is
-     // used to incrementally decode the image using small input buffers.
-     // Any one of these steps can be used to decode the WebP image.
-
-     // E.1) Decode full image.
-     CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
-
-     // E.2) Decode image incrementally.
-     WebPIDecoder* const idec = WebPIDecode(NULL, NULL, &config);
-     CHECK(idec != NULL);
-     while (bytes_remaining > 0) {
-       VP8StatusCode status = WebPIAppend(idec, input, bytes_read);
-       if (status == VP8_STATUS_OK || status == VP8_STATUS_SUSPENDED) {
-         bytes_remaining -= bytes_read;
-       } else {
-         break;
-       }
-     }
-     WebPIDelete(idec);
-
-     // F) Decoded image is now in config.output (and config.output.u.RGBA).
-     // It can be saved, displayed or otherwise processed.
-
-     // G) Reclaim memory allocated in config's object. It's safe to call
-     // this function even if the memory is external and wasn't allocated
-     // by WebPDecode().
-     WebPFreeDecBuffer(&config.output);
-
-------------------------------------- END PSEUDO EXAMPLE
-
-Bugs:
-=====
-
-Please report all bugs to the issue tracker:
-    https://bugs.chromium.org/p/webp
-Patches welcome! See this page to get started:
-    https://www.webmproject.org/code/contribute/submitting-patches/
-
-Discuss:
-========
-
-Email: webp-discuss@webmproject.org
-Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
--- a/README.md
+++ b/README.md
@ -0,0 +1,53 @@
+# WebP Codec
+
+```
+      __   __  ____  ____  ____
+     /  \\/  \/  _ \/  _ )/  _ \
+     \       /   __/  _  \   __/
+      \__\__/\____/\_____/__/ ____  ___
+            / _/ /    \    \ /  _ \/ _/
+           /  \_/   / /   \ \   __/  \__
+           \____/____/\_____/_____/____/v1.2.3
+```
+
+WebP codec is a library to encode and decode images in WebP format. This package
+contains the library that can be used in other programs to add WebP support, as
+well as the command line tools 'cwebp' and 'dwebp' to compress and decompress
+images respectively.
+
+See https://developers.google.com/speed/webp for details on the image format.
+
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
+
+It is released under the same license as the WebM project. See
+https://www.webmproject.org/license/software/ or the "COPYING" file for details.
+An additional intellectual property rights grant can be found in the file
+PATENTS.
+
+## Building
+
+See the [building documentation](doc/building.md).
+
+## Encoding and Decoding Tools
+
+The examples/ directory contains tools to encode and decode images and
+animations, view information about WebP images, and more. See the
+[tools documentation](doc/tools.md).
+
+## APIs
+
+See the [APIs documentation](doc/api.md), and API usage examples in the
+`examples/` directory.
+
+## Bugs
+
+Please report all bugs to the issue tracker: https://bugs.chromium.org/p/webp
+
+Patches welcome! See [how to contribute](CONTRIBUTING.md).
+
+## Discuss
+
+Email: webp-discuss@webmproject.org
+
+Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
--- a/README.mux
+++ b/README.mux
@ -1,258 +0,0 @@
-          __   __  ____  ____  ____  __ __  _     __ __
-         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
-         \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v1.2.2
-
-
-Description:
-============
-
-WebPMux: set of two libraries 'Mux' and 'Demux' for creation, extraction and
-manipulation of an extended format WebP file, which can have features like
-color profile, metadata and animation. Reference command-line tools 'webpmux'
-and 'vwebp' as well as the WebP container specification
-'doc/webp-container-spec.txt' are also provided in this package.
-
-WebP Mux tool:
-==============
-
-The examples/ directory contains a tool (webpmux) for manipulating WebP
-files. The webpmux tool can be used to create an extended format WebP file and
-also to extract or strip relevant data from such a file.
-
-A list of options is available using the -help command line flag:
-
-> webpmux -help
-Usage: webpmux -get GET_OPTIONS INPUT -o OUTPUT
-       webpmux -set SET_OPTIONS INPUT -o OUTPUT
-       webpmux -duration DURATION_OPTIONS [-duration ...]
-               INPUT -o OUTPUT
-       webpmux -strip STRIP_OPTIONS INPUT -o OUTPUT
-       webpmux -frame FRAME_OPTIONS [-frame...] [-loop LOOP_COUNT]
-               [-bgcolor BACKGROUND_COLOR] -o OUTPUT
-       webpmux -info INPUT
-       webpmux [-h|-help]
-       webpmux -version
-       webpmux argument_file_name
-
-GET_OPTIONS:
- Extract relevant data:
-   icc       get ICC profile
-   exif      get EXIF metadata
-   xmp       get XMP metadata
-   frame n   get nth frame
-
-SET_OPTIONS:
- Set color profile/metadata/parameters:
-   loop LOOP_COUNT            set the loop count
-   bgcolor BACKGROUND_COLOR   set the animation background color
-   icc  file.icc              set ICC profile
-   exif file.exif             set EXIF metadata
-   xmp  file.xmp              set XMP metadata
-   where:    'file.icc' contains the ICC profile to be set,
-             'file.exif' contains the EXIF metadata to be set
-             'file.xmp' contains the XMP metadata to be set
-
-DURATION_OPTIONS:
- Set duration of selected frames:
-   duration            set duration for each frames
-   duration,frame      set duration of a particular frame
-   duration,start,end  set duration of frames in the
-                        interval [start,end])
-   where: 'duration' is the duration in milliseconds
-          'start' is the start frame index
-          'end' is the inclusive end frame index
-           The special 'end' value '0' means: last frame.
-
-STRIP_OPTIONS:
- Strip color profile/metadata:
-   icc       strip ICC profile
-   exif      strip EXIF metadata
-   xmp       strip XMP metadata
-
-FRAME_OPTIONS(i):
- Create animation:
-   file_i +di+[xi+yi[+mi[bi]]]
-   where:    'file_i' is the i'th animation frame (WebP format),
-             'di' is the pause duration before next frame,
-             'xi','yi' specify the image offset for this frame,
-             'mi' is the dispose method for this frame (0 or 1),
-             'bi' is the blending method for this frame (+b or -b)
-
-LOOP_COUNT:
- Number of times to repeat the animation.
- Valid range is 0 to 65535 [Default: 0 (infinite)].
-
-BACKGROUND_COLOR:
- Background color of the canvas.
-  A,R,G,B
-  where:    'A', 'R', 'G' and 'B' are integers in the range 0 to 255 specifying
-            the Alpha, Red, Green and Blue component values respectively
-            [Default: 255,255,255,255]
-
-INPUT & OUTPUT are in WebP format.
-
-Note: The nature of EXIF, XMP and ICC data is not checked and is assumed to be
-valid.
-
-Note: if a single file name is passed as the argument, the arguments will be
-tokenized from this file. The file name must not start with the character '-'.
-
-Visualization tool:
-===================
-
-The examples/ directory also contains a tool (vwebp) for viewing WebP files.
-It decodes the image and visualizes it using OpenGL. See the libwebp README
-for details on building and running this program.
-
-Mux API:
-========
-The Mux API contains methods for adding data to and reading data from WebP
-files. This API currently supports XMP/EXIF metadata, ICC profile and animation.
-Other features may be added in subsequent releases.
-
-Example#1 (pseudo code): Creating a WebPMux object with image data, color
-profile and XMP metadata.
-
-  int copy_data = 0;
-  WebPMux* mux = WebPMuxNew();
-  // ... (Prepare image data).
-  WebPMuxSetImage(mux, &image, copy_data);
-  // ... (Prepare ICC profile data).
-  WebPMuxSetChunk(mux, "ICCP", &icc_profile, copy_data);
-  // ... (Prepare XMP metadata).
-  WebPMuxSetChunk(mux, "XMP ", &xmp, copy_data);
-  // Get data from mux in WebP RIFF format.
-  WebPMuxAssemble(mux, &output_data);
-  WebPMuxDelete(mux);
-  // ... (Consume output_data; e.g. write output_data.bytes to file).
-  WebPDataClear(&output_data);
-
-
-Example#2 (pseudo code): Get image and color profile data from a WebP file.
-
-  int copy_data = 0;
-  // ... (Read data from file).
-  WebPMux* mux = WebPMuxCreate(&data, copy_data);
-  WebPMuxGetFrame(mux, 1, &image);
-  // ... (Consume image; e.g. call WebPDecode() to decode the data).
-  WebPMuxGetChunk(mux, "ICCP", &icc_profile);
-  // ... (Consume icc_profile).
-  WebPMuxDelete(mux);
-  free(data);
-
-
-For a detailed Mux API reference, please refer to the header file
-(src/webp/mux.h).
-
-Demux API:
-==========
-The Demux API enables extraction of images and extended format data from
-WebP files. This API currently supports reading of XMP/EXIF metadata, ICC
-profile and animated images. Other features may be added in subsequent
-releases.
-
-Code example: Demuxing WebP data to extract all the frames, ICC profile
-and EXIF/XMP metadata.
-
-  WebPDemuxer* demux = WebPDemux(&webp_data);
-  uint32_t width = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
-  uint32_t height = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
-  // ... (Get information about the features present in the WebP file).
-  uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
-
-  // ... (Iterate over all frames).
-  WebPIterator iter;
-  if (WebPDemuxGetFrame(demux, 1, &iter)) {
-    do {
-      // ... (Consume 'iter'; e.g. Decode 'iter.fragment' with WebPDecode(),
-      // ... and get other frame properties like width, height, offsets etc.
-      // ... see 'struct WebPIterator' below for more info).
-    } while (WebPDemuxNextFrame(&iter));
-    WebPDemuxReleaseIterator(&iter);
-  }
-
-  // ... (Extract metadata).
-  WebPChunkIterator chunk_iter;
-  if (flags & ICCP_FLAG) WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter);
-  // ... (Consume the ICC profile in 'chunk_iter.chunk').
-  WebPDemuxReleaseChunkIterator(&chunk_iter);
-  if (flags & EXIF_FLAG) WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter);
-  // ... (Consume the EXIF metadata in 'chunk_iter.chunk').
-  WebPDemuxReleaseChunkIterator(&chunk_iter);
-  if (flags & XMP_FLAG) WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter);
-  // ... (Consume the XMP metadata in 'chunk_iter.chunk').
-  WebPDemuxReleaseChunkIterator(&chunk_iter);
-  WebPDemuxDelete(demux);
-
-
-For a detailed Demux API reference, please refer to the header file
-(src/webp/demux.h).
-
-AnimEncoder API:
-================
-The AnimEncoder API can be used to create animated WebP images.
-
-Code example:
-
-  WebPAnimEncoderOptions enc_options;
-  WebPAnimEncoderOptionsInit(&enc_options);
-  // ... (Tune 'enc_options' as needed).
-  WebPAnimEncoder* enc = WebPAnimEncoderNew(width, height, &enc_options);
-  while(<there are more frames>) {
-    WebPConfig config;
-    WebPConfigInit(&config);
-    // ... (Tune 'config' as needed).
-    WebPAnimEncoderAdd(enc, frame, duration, &config);
-  }
-  WebPAnimEncoderAssemble(enc, webp_data);
-  WebPAnimEncoderDelete(enc);
-  // ... (Write the 'webp_data' to a file, or re-mux it further).
-
-
-For a detailed AnimEncoder API reference, please refer to the header file
-(src/webp/mux.h).
-
-AnimDecoder API:
-================
-This AnimDecoder API allows decoding (possibly) animated WebP images.
-
-Code Example:
-
-  WebPAnimDecoderOptions dec_options;
-  WebPAnimDecoderOptionsInit(&dec_options);
-  // Tune 'dec_options' as needed.
-  WebPAnimDecoder* dec = WebPAnimDecoderNew(webp_data, &dec_options);
-  WebPAnimInfo anim_info;
-  WebPAnimDecoderGetInfo(dec, &anim_info);
-  for (uint32_t i = 0; i < anim_info.loop_count; ++i) {
-    while (WebPAnimDecoderHasMoreFrames(dec)) {
-      uint8_t* buf;
-      int timestamp;
-      WebPAnimDecoderGetNext(dec, &buf, &timestamp);
-      // ... (Render 'buf' based on 'timestamp').
-      // ... (Do NOT free 'buf', as it is owned by 'dec').
-    }
-    WebPAnimDecoderReset(dec);
-  }
-  const WebPDemuxer* demuxer = WebPAnimDecoderGetDemuxer(dec);
-  // ... (Do something using 'demuxer'; e.g. get EXIF/XMP/ICC data).
-  WebPAnimDecoderDelete(dec);
-
-For a detailed AnimDecoder API reference, please refer to the header file
-(src/webp/demux.h).
-
-
-Bugs:
-=====
-
-Please report all bugs to the issue tracker:
-    https://bugs.chromium.org/p/webp
-Patches welcome! See this page to get started:
-    https://www.webmproject.org/code/contribute/submitting-patches/
-
-Discuss:
-========
-
-Email: webp-discuss@webmproject.org
-Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
--- a/README.webp_js
+++ b/README.webp_js
@ -1,75 +0,0 @@
-     __   __ ____ ____ ____     __  ____
-    /  \\/  \  _ \  _ \  _ \   (__)/  __\
-    \       /  __/ _  \  __/   _)  \_   \
-     \__\__/_____/____/_/     /____/____/
-
-Description:
-============
-
-This file describes the compilation of libwebp into a JavaScript decoder
-using Emscripten and CMake.
-
- - install the Emscripten SDK following the procedure described at:
-   https://emscripten.org/docs/getting_started/downloads.html#installation-instructions-using-the-emsdk-recommended
-   After installation, you should have some global variable positioned to the
-   location of the SDK. In particular, $EMSDK should point to the
-   top-level directory containing Emscripten tools.
-
- - configure the project 'WEBP_JS' with CMake using:
-
- cd webp_js && \
- emcmake cmake -DWEBP_BUILD_WEBP_JS=ON \
-       ../
-
- - compile webp.js using 'emmake make'.
-
- - that's it! Upon completion, you should have the webp.js and
-   webp.wasm files generated.
-
-The callable JavaScript function is WebPToSDL(), which decodes a raw WebP
-bitstream into a canvas. See webp_js/index.html for a simple usage sample
-(see below for instructions).
-
-Demo HTML page:
-===============
-
-   The HTML page webp_js/index.html requires an HTTP server to serve the WebP
-   image example. It's easy to just use Python for that.
-
-cd webp_js && python -m SimpleHTTPServer 8080
-
-and then navigate to http://localhost:8080 in your favorite browser.
-
-
-Web-Assembly (WASM) version:
-============================
-
-  CMakeLists.txt is configured to build the WASM version when using
-  the option WEBP_BUILD_WEBP_JS=ON. The compilation step will assemble
-  the files 'webp_wasm.js', 'webp_wasm.wasm' in the webp_js/ directory.
-  See webp_js/index_wasm.html for a simple demo page using the WASM version
-  of the library.
-
-  You will need a fairly recent version of Emscripten (at least 2.0.18,
-  latest-upstream is recommended) and of your WASM-enabled browser to run this
-  version.
-
-Caveat:
-=======
-
-  - First decoding using the library is usually slower, due to just-in-time
-    compilation.
-
-  - Some versions of llvm produce the following compile error when SSE2 is
-    enabled.
-
-"Unsupported:   %516 = bitcast <8 x i16> %481 to i128
- LLVM ERROR: BitCast Instruction not yet supported for integer types larger than 64 bits"
-
-    The corresponding Emscripten bug is at:
-    https://github.com/kripken/emscripten/issues/3788
-
-    Therefore, SSE2 optimization is currently disabled in CMakeLists.txt.
-
-  - If WEBP_ENABLE_SIMD is set to 1 the JavaScript version (webp.js) will be
-    disabled as wasm2js does not support SIMD.
--- a/build.gradle
+++ b/build.gradle
@ -105,6 +105,13 @@ model {
      sources {
        c {
          source {
+            srcDir "sharpyuv"
+            include "sharpyuv.c"
+            include "sharpyuv_csp.c"
+            include "sharpyuv_dsp.c"
+            include "sharpyuv_gamma.c"
+            include "sharpyuv_neon.c"
+            include "sharpyuv_sse2.c"
            srcDir "src/dec"
            include "alpha_dec.c"
            include "buffer_dec.c"
--- a/cmake/deps.cmake
+++ b/cmake/deps.cmake
@ -45,7 +45,7 @@ if(WEBP_USE_THREAD)
        }
      " FLAG_HAVE_PTHREAD_PRIO_INHERIT)
    set(HAVE_PTHREAD_PRIO_INHERIT ${FLAG_HAVE_PTHREAD_PRIO_INHERIT})
-    list(APPEND WEBP_DEP_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+    list(APPEND WEBP_DEP_LIBRARIES Threads::Threads)
  endif()
  set(WEBP_USE_THREAD ${Threads_FOUND})
 endif()
@ -74,6 +74,11 @@ endif()
 set(WEBP_DEP_IMG_LIBRARIES)
 set(WEBP_DEP_IMG_INCLUDE_DIRS)
 foreach(I_LIB PNG JPEG TIFF)
+  # Disable tiff when compiling in static mode as it is failing on Ubuntu.
+  if(WEBP_LINK_STATIC AND ${I_LIB} STREQUAL "TIFF")
+    message("TIFF is disabled when statically linking.")
+    continue()
+  endif()
  find_package(${I_LIB})
  set(WEBP_HAVE_${I_LIB} ${${I_LIB}_FOUND})
  if(${I_LIB}_FOUND)
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([libwebp], [1.2.2],
+AC_INIT([libwebp], [1.2.3],
        [https://bugs.chromium.org/p/webp],,
        [https://developers.google.com/speed/webp])
 AC_CANONICAL_HOST
@ -9,7 +9,8 @@ dnl === automake >= 1.12 requires this for 'unusual archivers' support.
 dnl === it must occur before LT_INIT (AC_PROG_LIBTOOL).
 m4_ifdef([AM_PROG_AR], [AM_PROG_AR])

-AC_PROG_LIBTOOL
+dnl === AC_PROG_LIBTOOL is deprecated.
+m4_ifdef([LT_INIT], [LT_INIT], [AC_PROG_LIBTOOL])
 AC_PROG_SED
 AM_PROG_CC_C_O

@ -27,7 +28,8 @@ AC_ARG_ENABLE([everything],
              AS_HELP_STRING([--enable-everything],
                             [Enable all optional targets. These can still be
                              disabled with --disable-target]),
-              [SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
+              [SET_IF_UNSET([enable_libsharpyuv], [$enableval])
+               SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
               SET_IF_UNSET([enable_libwebpdemux], [$enableval])
               SET_IF_UNSET([enable_libwebpextras], [$enableval])
               SET_IF_UNSET([enable_libwebpmux], [$enableval])])
@ -751,6 +753,7 @@ AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_HEADERS([src/webp/config.h])
 AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
                 examples/Makefile extras/Makefile imageio/Makefile \
+                 sharpyuv/Makefile \
                 src/dec/Makefile src/enc/Makefile src/dsp/Makefile \
                 src/demux/Makefile src/mux/Makefile \
                 src/utils/Makefile \
--- a/doc/README
+++ b/doc/README
@ -1,29 +0,0 @@
-
-Generate libwebp Container Spec Docs from Text Source
-=====================================================
-
-HTML generation requires kramdown [1], easily installed as a
-rubygem [2].  Rubygems installation should satisfy dependencies
-automatically.
-
-[1]: https://kramdown.gettalong.org/
-[2]: https://rubygems.org/
-
-HTML generation can then be done from the project root:
-
-$ kramdown doc/webp-container-spec.txt --template doc/template.html > \
-  doc/output/webp-container-spec.html
-
-kramdown can optionally syntax highlight code blocks, using CodeRay [3],
-a dependency of kramdown that rubygems will install automatically.  The
-following will apply inline CSS styling; an external stylesheet is not
-needed.
-
-$ kramdown doc/webp-lossless-bitstream-spec.txt --template \
-  doc/template.html --coderay-css style --coderay-line-numbers ' ' \
-  --coderay-default-lang c > \
-  doc/output/webp-lossless-bitstream-spec.html
-
-Optimally, use kramdown 0.13.7 or newer if syntax highlighting desired.
-
-[3]: https://github.com/rubychan/coderay
--- a/doc/api.md
+++ b/doc/api.md
@ -0,0 +1,385 @@
+# WebP APIs
+
+## Encoding API
+
+The main encoding functions are available in the header src/webp/encode.h
+
+The ready-to-use ones are:
+
+```c
+size_t WebPEncodeRGB(const uint8_t* rgb, int width, int height, int stride,
+                     float quality_factor, uint8_t** output);
+size_t WebPEncodeBGR(const uint8_t* bgr, int width, int height, int stride,
+                     float quality_factor, uint8_t** output);
+size_t WebPEncodeRGBA(const uint8_t* rgba, int width, int height, int stride,
+                      float quality_factor, uint8_t** output);
+size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride,
+                      float quality_factor, uint8_t** output);
+```
+
+They will convert raw RGB samples to a WebP data. The only control supplied is
+the quality factor.
+
+There are some variants for using the lossless format:
+
+```c
+size_t WebPEncodeLosslessRGB(const uint8_t* rgb, int width, int height,
+                             int stride, uint8_t** output);
+size_t WebPEncodeLosslessBGR(const uint8_t* bgr, int width, int height,
+                             int stride, uint8_t** output);
+size_t WebPEncodeLosslessRGBA(const uint8_t* rgba, int width, int height,
+                              int stride, uint8_t** output);
+size_t WebPEncodeLosslessBGRA(const uint8_t* bgra, int width, int height,
+                              int stride, uint8_t** output);
+```
+
+Of course in this case, no quality factor is needed since the compression occurs
+without loss of the input values, at the expense of larger output sizes.
+
+### Advanced encoding API
+
+A more advanced API is based on the WebPConfig and WebPPicture structures.
+
+WebPConfig contains the encoding settings and is not tied to a particular
+picture. WebPPicture contains input data, on which some WebPConfig will be used
+for compression. The encoding flow looks like:
+
+```c
+#include <webp/encode.h>
+
+// Setup a config, starting form a preset and tuning some additional
+// parameters
+WebPConfig config;
+if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor)) {
+  return 0;   // version error
+}
+// ... additional tuning
+config.sns_strength = 90;
+config.filter_sharpness = 6;
+config_error = WebPValidateConfig(&config);  // not mandatory, but useful
+
+// Setup the input data
+WebPPicture pic;
+if (!WebPPictureInit(&pic)) {
+  return 0;  // version error
+}
+pic.width = width;
+pic.height = height;
+// allocated picture of dimension width x height
+if (!WebPPictureAlloc(&pic)) {
+  return 0;   // memory error
+}
+// at this point, 'pic' has been initialized as a container,
+// and can receive the Y/U/V samples.
+// Alternatively, one could use ready-made import functions like
+// WebPPictureImportRGB(), which will take care of memory allocation.
+// In any case, past this point, one will have to call
+// WebPPictureFree(&pic) to reclaim memory.
+
+// Set up a byte-output write method. WebPMemoryWriter, for instance.
+WebPMemoryWriter wrt;
+WebPMemoryWriterInit(&wrt);     // initialize 'wrt'
+
+pic.writer = MyFileWriter;
+pic.custom_ptr = my_opaque_structure_to_make_MyFileWriter_work;
+
+// Compress!
+int ok = WebPEncode(&config, &pic);   // ok = 0 => error occurred!
+WebPPictureFree(&pic);  // must be called independently of the 'ok' result.
+
+// output data should have been handled by the writer at that point.
+// -> compressed data is the memory buffer described by wrt.mem / wrt.size
+
+// deallocate the memory used by compressed data
+WebPMemoryWriterClear(&wrt);
+```
+
+## Decoding API
+
+This is mainly just one function to call:
+
+```c
+#include "webp/decode.h"
+uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
+                       int* width, int* height);
+```
+
+Please have a look at the file src/webp/decode.h for the details. There are
+variants for decoding in BGR/RGBA/ARGB/BGRA order, along with decoding to raw
+Y'CbCr samples. One can also decode the image directly into a pre-allocated
+buffer.
+
+To detect a WebP file and gather the picture's dimensions, the function:
+
+```c
+int WebPGetInfo(const uint8_t* data, size_t data_size,
+                int* width, int* height);
+```
+
+is supplied. No decoding is involved when using it.
+
+### Incremental decoding API
+
+In the case when data is being progressively transmitted, pictures can still be
+incrementally decoded using a slightly more complicated API. Decoder state is
+stored into an instance of the WebPIDecoder object. This object can be created
+with the purpose of decoding either RGB or Y'CbCr samples. For instance:
+
+```c
+WebPDecBuffer buffer;
+WebPInitDecBuffer(&buffer);
+buffer.colorspace = MODE_BGR;
+...
+WebPIDecoder* idec = WebPINewDecoder(&buffer);
+```
+
+As data is made progressively available, this incremental-decoder object can be
+used to decode the picture further. There are two (mutually exclusive) ways to
+pass freshly arrived data:
+
+either by appending the fresh bytes:
+
+```c
+WebPIAppend(idec, fresh_data, size_of_fresh_data);
+```
+
+or by just mentioning the new size of the transmitted data:
+
+```c
+WebPIUpdate(idec, buffer, size_of_transmitted_buffer);
+```
+
+Note that 'buffer' can be modified between each call to WebPIUpdate, in
+particular when the buffer is resized to accommodate larger data.
+
+These functions will return the decoding status: either VP8_STATUS_SUSPENDED if
+decoding is not finished yet or VP8_STATUS_OK when decoding is done. Any other
+status is an error condition.
+
+The 'idec' object must always be released (even upon an error condition) by
+calling: WebPDelete(idec).
+
+To retrieve partially decoded picture samples, one must use the corresponding
+method: WebPIDecGetRGB or WebPIDecGetYUVA. It will return the last displayable
+pixel row.
+
+Lastly, note that decoding can also be performed into a pre-allocated pixel
+buffer. This buffer must be passed when creating a WebPIDecoder, calling
+WebPINewRGB() or WebPINewYUVA().
+
+Please have a look at the src/webp/decode.h header for further details.
+
+### Advanced Decoding API
+
+WebP decoding supports an advanced API which provides on-the-fly cropping and
+rescaling, something of great usefulness on memory-constrained environments like
+mobile phones. Basically, the memory usage will scale with the output's size,
+not the input's, when one only needs a quick preview or a zoomed in portion of
+an otherwise too-large picture. Some CPU can be saved too, incidentally.
+
+```c
+// A) Init a configuration object
+WebPDecoderConfig config;
+CHECK(WebPInitDecoderConfig(&config));
+
+// B) optional: retrieve the bitstream's features.
+CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
+
+// C) Adjust 'config' options, if needed
+config.options.no_fancy_upsampling = 1;
+config.options.use_scaling = 1;
+config.options.scaled_width = scaledWidth();
+config.options.scaled_height = scaledHeight();
+// etc.
+
+// D) Specify 'config' output options for specifying output colorspace.
+// Optionally the external image decode buffer can also be specified.
+config.output.colorspace = MODE_BGRA;
+// Optionally, the config.output can be pointed to an external buffer as
+// well for decoding the image. This externally supplied memory buffer
+// should be big enough to store the decoded picture.
+config.output.u.RGBA.rgba = (uint8_t*) memory_buffer;
+config.output.u.RGBA.stride = scanline_stride;
+config.output.u.RGBA.size = total_size_of_the_memory_buffer;
+config.output.is_external_memory = 1;
+
+// E) Decode the WebP image. There are two variants w.r.t decoding image.
+// The first one (E.1) decodes the full image and the second one (E.2) is
+// used to incrementally decode the image using small input buffers.
+// Any one of these steps can be used to decode the WebP image.
+
+// E.1) Decode full image.
+CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
+
+// E.2) Decode image incrementally.
+WebPIDecoder* const idec = WebPIDecode(NULL, NULL, &config);
+CHECK(idec != NULL);
+while (bytes_remaining > 0) {
+  VP8StatusCode status = WebPIAppend(idec, input, bytes_read);
+  if (status == VP8_STATUS_OK || status == VP8_STATUS_SUSPENDED) {
+    bytes_remaining -= bytes_read;
+  } else {
+    break;
+  }
+}
+WebPIDelete(idec);
+
+// F) Decoded image is now in config.output (and config.output.u.RGBA).
+// It can be saved, displayed or otherwise processed.
+
+// G) Reclaim memory allocated in config's object. It's safe to call
+// this function even if the memory is external and wasn't allocated
+// by WebPDecode().
+WebPFreeDecBuffer(&config.output);
+```
+
+## Webp Mux
+
+WebPMux is a set of two libraries 'Mux' and 'Demux' for creation, extraction and
+manipulation of an extended format WebP file, which can have features like color
+profile, metadata and animation. Reference command-line tools `webpmux` and
+`vwebp` as well as the WebP container specification
+'doc/webp-container-spec.txt' are also provided in this package, see the
+[tools documentation](tools.md).
+
+### Mux API
+
+The Mux API contains methods for adding data to and reading data from WebP
+files. This API currently supports XMP/EXIF metadata, ICC profile and animation.
+Other features may be added in subsequent releases.
+
+Example#1 (pseudo code): Creating a WebPMux object with image data, color
+profile and XMP metadata.
+
+```c
+int copy_data = 0;
+WebPMux* mux = WebPMuxNew();
+// ... (Prepare image data).
+WebPMuxSetImage(mux, &image, copy_data);
+// ... (Prepare ICC profile data).
+WebPMuxSetChunk(mux, "ICCP", &icc_profile, copy_data);
+// ... (Prepare XMP metadata).
+WebPMuxSetChunk(mux, "XMP ", &xmp, copy_data);
+// Get data from mux in WebP RIFF format.
+WebPMuxAssemble(mux, &output_data);
+WebPMuxDelete(mux);
+// ... (Consume output_data; e.g. write output_data.bytes to file).
+WebPDataClear(&output_data);
+```
+
+Example#2 (pseudo code): Get image and color profile data from a WebP file.
+
+```c
+int copy_data = 0;
+// ... (Read data from file).
+WebPMux* mux = WebPMuxCreate(&data, copy_data);
+WebPMuxGetFrame(mux, 1, &image);
+// ... (Consume image; e.g. call WebPDecode() to decode the data).
+WebPMuxGetChunk(mux, "ICCP", &icc_profile);
+// ... (Consume icc_profile).
+WebPMuxDelete(mux);
+free(data);
+```
+
+For a detailed Mux API reference, please refer to the header file
+(src/webp/mux.h).
+
+### Demux API
+
+The Demux API enables extraction of images and extended format data from WebP
+files. This API currently supports reading of XMP/EXIF metadata, ICC profile and
+animated images. Other features may be added in subsequent releases.
+
+Code example: Demuxing WebP data to extract all the frames, ICC profile and
+EXIF/XMP metadata.
+
+```c
+WebPDemuxer* demux = WebPDemux(&webp_data);
+uint32_t width = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
+uint32_t height = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
+// ... (Get information about the features present in the WebP file).
+uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
+
+// ... (Iterate over all frames).
+WebPIterator iter;
+if (WebPDemuxGetFrame(demux, 1, &iter)) {
+  do {
+    // ... (Consume 'iter'; e.g. Decode 'iter.fragment' with WebPDecode(),
+    // ... and get other frame properties like width, height, offsets etc.
+    // ... see 'struct WebPIterator' below for more info).
+  } while (WebPDemuxNextFrame(&iter));
+  WebPDemuxReleaseIterator(&iter);
+}
+
+// ... (Extract metadata).
+WebPChunkIterator chunk_iter;
+if (flags & ICCP_FLAG) WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter);
+// ... (Consume the ICC profile in 'chunk_iter.chunk').
+WebPDemuxReleaseChunkIterator(&chunk_iter);
+if (flags & EXIF_FLAG) WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter);
+// ... (Consume the EXIF metadata in 'chunk_iter.chunk').
+WebPDemuxReleaseChunkIterator(&chunk_iter);
+if (flags & XMP_FLAG) WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter);
+// ... (Consume the XMP metadata in 'chunk_iter.chunk').
+WebPDemuxReleaseChunkIterator(&chunk_iter);
+WebPDemuxDelete(demux);
+```
+
+For a detailed Demux API reference, please refer to the header file
+(src/webp/demux.h).
+
+## AnimEncoder API
+
+The AnimEncoder API can be used to create animated WebP images.
+
+Code example:
+
+```c
+WebPAnimEncoderOptions enc_options;
+WebPAnimEncoderOptionsInit(&enc_options);
+// ... (Tune 'enc_options' as needed).
+WebPAnimEncoder* enc = WebPAnimEncoderNew(width, height, &enc_options);
+while(<there are more frames>) {
+  WebPConfig config;
+  WebPConfigInit(&config);
+  // ... (Tune 'config' as needed).
+  WebPAnimEncoderAdd(enc, frame, duration, &config);
+}
+WebPAnimEncoderAssemble(enc, webp_data);
+WebPAnimEncoderDelete(enc);
+// ... (Write the 'webp_data' to a file, or re-mux it further).
+```
+
+For a detailed AnimEncoder API reference, please refer to the header file
+(src/webp/mux.h).
+
+## AnimDecoder API
+
+This AnimDecoder API allows decoding (possibly) animated WebP images.
+
+Code Example:
+
+```c
+WebPAnimDecoderOptions dec_options;
+WebPAnimDecoderOptionsInit(&dec_options);
+// Tune 'dec_options' as needed.
+WebPAnimDecoder* dec = WebPAnimDecoderNew(webp_data, &dec_options);
+WebPAnimInfo anim_info;
+WebPAnimDecoderGetInfo(dec, &anim_info);
+for (uint32_t i = 0; i < anim_info.loop_count; ++i) {
+  while (WebPAnimDecoderHasMoreFrames(dec)) {
+    uint8_t* buf;
+    int timestamp;
+    WebPAnimDecoderGetNext(dec, &buf, &timestamp);
+    // ... (Render 'buf' based on 'timestamp').
+    // ... (Do NOT free 'buf', as it is owned by 'dec').
+  }
+  WebPAnimDecoderReset(dec);
+}
+const WebPDemuxer* demuxer = WebPAnimDecoderGetDemuxer(dec);
+// ... (Do something using 'demuxer'; e.g. get EXIF/XMP/ICC data).
+WebPAnimDecoderDelete(dec);
+```
+
+For a detailed AnimDecoder API reference, please refer to the header file
+(src/webp/demux.h).
--- a/doc/building.md
+++ b/doc/building.md
@ -0,0 +1,213 @@
+# Building
+
+## Windows build
+
+By running:
+
+```batch
+nmake /f Makefile.vc CFG=release-static RTLIBCFG=static OBJDIR=output
+```
+
+the directory `output\release-static\(x64|x86)\bin` will contain the tools
+cwebp.exe and dwebp.exe. The directory `output\release-static\(x64|x86)\lib`
+will contain the libwebp static library. The target architecture (x86/x64) is
+detected by Makefile.vc from the Visual Studio compiler (cl.exe) available in
+the system path.
+
+## Unix build using makefile.unix
+
+On platforms with GNU tools installed (gcc and make), running
+
+```shell
+make -f makefile.unix
+```
+
+will build the binaries examples/cwebp and examples/dwebp, along with the static
+library src/libwebp.a. No system-wide installation is supplied, as this is a
+simple alternative to the full installation system based on the autoconf tools
+(see below). Please refer to makefile.unix for additional details and
+customizations.
+
+## Using autoconf tools
+
+Prerequisites: a compiler (e.g., gcc), make, autoconf, automake, libtool.
+
+On a Debian-like system the following should install everything you need for a
+minimal build:
+
+```shell
+$ sudo apt-get install gcc make autoconf automake libtool
+```
+
+When building from git sources, you will need to run autogen.sh to generate the
+configure script.
+
+```shell
+./configure
+make
+make install
+```
+
+should be all you need to have the following files
+
+```
+/usr/local/include/webp/decode.h
+/usr/local/include/webp/encode.h
+/usr/local/include/webp/types.h
+/usr/local/lib/libwebp.*
+/usr/local/bin/cwebp
+/usr/local/bin/dwebp
+```
+
+installed.
+
+Note: A decode-only library, libwebpdecoder, is available using the
+`--enable-libwebpdecoder` flag. The encode library is built separately and can
+be installed independently using a minor modification in the corresponding
+Makefile.am configure files (see comments there). See `./configure --help` for
+more options.
+
+## Building for MIPS Linux
+
+MIPS Linux toolchain stable available releases can be found at:
+https://community.imgtec.com/developers/mips/tools/codescape-mips-sdk/available-releases/
+
+```shell
+# Add toolchain to PATH
+export PATH=$PATH:/path/to/toolchain/bin
+
+# 32-bit build for mips32r5 (p5600)
+HOST=mips-mti-linux-gnu
+MIPS_CFLAGS="-O3 -mips32r5 -mabi=32 -mtune=p5600 -mmsa -mfp64 \
+  -msched-weight -mload-store-pairs -fPIE"
+MIPS_LDFLAGS="-mips32r5 -mabi=32 -mmsa -mfp64 -pie"
+
+# 64-bit build for mips64r6 (i6400)
+HOST=mips-img-linux-gnu
+MIPS_CFLAGS="-O3 -mips64r6 -mabi=64 -mtune=i6400 -mmsa -mfp64 \
+  -msched-weight -mload-store-pairs -fPIE"
+MIPS_LDFLAGS="-mips64r6 -mabi=64 -mmsa -mfp64 -pie"
+
+./configure --host=${HOST} --build=`config.guess` \
+  CC="${HOST}-gcc -EL" \
+  CFLAGS="$MIPS_CFLAGS" \
+  LDFLAGS="$MIPS_LDFLAGS"
+make
+make install
+```
+
+## CMake
+
+With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp, webpinfo
+and the JS bindings.
+
+Prerequisites: a compiler (e.g., gcc with autotools) and CMake.
+
+On a Debian-like system the following should install everything you need for a
+minimal build:
+
+```shell
+$ sudo apt-get install build-essential cmake
+```
+
+When building from git sources, you will need to run cmake to generate the
+makefiles.
+
+```shell
+mkdir build && cd build && cmake ../
+make
+make install
+```
+
+If you also want any of the executables, you will need to enable them through
+CMake, e.g.:
+
+```shell
+cmake -DWEBP_BUILD_CWEBP=ON -DWEBP_BUILD_DWEBP=ON ../
+```
+
+or through your favorite interface (like ccmake or cmake-qt-gui).
+
+Use option `-DWEBP_UNICODE=ON` for Unicode support on Windows (with chcp 65001).
+
+Finally, once installed, you can also use WebP in your CMake project by doing:
+
+```cmake
+find_package(WebP)
+```
+
+which will define the CMake variables WebP_INCLUDE_DIRS and WebP_LIBRARIES.
+
+## Gradle
+
+The support for Gradle is minimal: it only helps you compile libwebp, cwebp and
+dwebp and webpmux_example.
+
+Prerequisites: a compiler (e.g., gcc with autotools) and gradle.
+
+On a Debian-like system the following should install everything you need for a
+minimal build:
+
+```shell
+$ sudo apt-get install build-essential gradle
+```
+
+When building from git sources, you will need to run the Gradle wrapper with the
+appropriate target, e.g. :
+
+```shell
+./gradlew buildAllExecutables
+```
+
+## SWIG bindings
+
+To generate language bindings from swig/libwebp.swig at least swig-1.3
+(http://www.swig.org) is required.
+
+Currently the following functions are mapped:
+
+Decode:
+
+```
+WebPGetDecoderVersion
+WebPGetInfo
+WebPDecodeRGBA
+WebPDecodeARGB
+WebPDecodeBGRA
+WebPDecodeBGR
+WebPDecodeRGB
+```
+
+Encode:
+
+```
+WebPGetEncoderVersion
+WebPEncodeRGBA
+WebPEncodeBGRA
+WebPEncodeRGB
+WebPEncodeBGR
+WebPEncodeLosslessRGBA
+WebPEncodeLosslessBGRA
+WebPEncodeLosslessRGB
+WebPEncodeLosslessBGR
+```
+
+See also the [swig documentation](../swig/README.md) for more detailed build
+instructions and usage examples.
+
+### Java bindings
+
+To build the swig-generated JNI wrapper code at least JDK-1.5 (or equivalent) is
+necessary for enum support. The output is intended to be a shared object / DLL
+that can be loaded via `System.loadLibrary("webp_jni")`.
+
+### Python bindings
+
+To build the swig-generated Python extension code at least Python 2.6 is
+required. Python < 2.6 may build with some minor changes to libwebp.swig or the
+generated code, but is untested.
+
+## Javascript decoder
+
+Libwebp can be compiled into a JavaScript decoder using Emscripten and CMake.
+See the [corresponding documentation](../README.md)
--- a/doc/specs_generation.md
+++ b/doc/specs_generation.md
@ -0,0 +1,26 @@
+# Generate libwebp Container Spec Docs from Text Source
+
+HTML generation requires [kramdown](https://kramdown.gettalong.org/), easily
+installed as a [rubygem](https://rubygems.org/). Rubygems installation should
+satisfy dependencies automatically.
+
+HTML generation can then be done from the project root:
+
+```shell
+$ kramdown doc/webp-container-spec.txt --template doc/template.html > \
+  doc/output/webp-container-spec.html
+```
+
+kramdown can optionally syntax highlight code blocks, using
+[CodeRay](https://github.com/rubychan/coderay), a dependency of kramdown that
+rubygems will install automatically. The following will apply inline CSS
+styling; an external stylesheet is not needed.
+
+```shell
+$ kramdown doc/webp-lossless-bitstream-spec.txt --template \
+  doc/template.html --coderay-css style --coderay-line-numbers ' ' \
+  --coderay-default-lang c > \
+  doc/output/webp-lossless-bitstream-spec.html
+```
+
+Optimally, use kramdown 0.13.7 or newer if syntax highlighting desired.
--- a/doc/tools.md
+++ b/doc/tools.md
@ -0,0 +1,512 @@
+# WebP tools
+
+## Encoding tool
+
+The examples/ directory contains tools for encoding (cwebp) and decoding (dwebp)
+images.
+
+The easiest use should look like:
+
+```shell
+cwebp input.png -q 80 -o output.webp
+```
+
+which will convert the input file to a WebP file using a quality factor of 80 on
+a 0->100 scale (0 being the lowest quality, 100 being the best. Default value is
+75).
+
+You might want to try the `-lossless` flag too, which will compress the source
+(in RGBA format) without any loss. The `-q` quality parameter will in this case
+control the amount of processing time spent trying to make the output file as
+small as possible.
+
+A longer list of options is available using the `-longhelp` command line flag:
+
+```shell
+> cwebp -longhelp
+Usage:
+ cwebp [-preset <...>] [options] in_file [-o out_file]
+```
+
+If input size (-s) for an image is not specified, it is assumed to be a PNG,
+JPEG, TIFF or WebP file. Note: Animated PNG and WebP files are not supported.
+
+Options:
+
+```
+-h / -help ............. short help
+-H / -longhelp ......... long help
+-q <float> ............. quality factor (0:small..100:big), default=75
+-alpha_q <int> ......... transparency-compression quality (0..100),
+                         default=100
+-preset <string> ....... preset setting, one of:
+                          default, photo, picture,
+                          drawing, icon, text
+   -preset must come first, as it overwrites other parameters
+-z <int> ............... activates lossless preset with given
+                         level in [0:fast, ..., 9:slowest]
+
+-m <int> ............... compression method (0=fast, 6=slowest), default=4
+-segments <int> ........ number of segments to use (1..4), default=4
+-size <int> ............ target size (in bytes)
+-psnr <float> .......... target PSNR (in dB. typically: 42)
+
+-s <int> <int> ......... input size (width x height) for YUV
+-sns <int> ............. spatial noise shaping (0:off, 100:max), default=50
+-f <int> ............... filter strength (0=off..100), default=60
+-sharpness <int> ....... filter sharpness (0:most .. 7:least sharp), default=0
+-strong ................ use strong filter instead of simple (default)
+-nostrong .............. use simple filter instead of strong
+-sharp_yuv ............. use sharper (and slower) RGB->YUV conversion
+-partition_limit <int> . limit quality to fit the 512k limit on
+                         the first partition (0=no degradation ... 100=full)
+-pass <int> ............ analysis pass number (1..10)
+-qrange <min> <max> .... specifies the permissible quality range
+                         (default: 0 100)
+-crop <x> <y> <w> <h> .. crop picture with the given rectangle
+-resize <w> <h> ........ resize picture (*after* any cropping)
+-mt .................... use multi-threading if available
+-low_memory ............ reduce memory usage (slower encoding)
+-map <int> ............. print map of extra info
+-print_psnr ............ prints averaged PSNR distortion
+-print_ssim ............ prints averaged SSIM distortion
+-print_lsim ............ prints local-similarity distortion
+-d <file.pgm> .......... dump the compressed output (PGM file)
+-alpha_method <int> .... transparency-compression method (0..1), default=1
+-alpha_filter <string> . predictive filtering for alpha plane,
+                         one of: none, fast (default) or best
+-exact ................. preserve RGB values in transparent area, default=off
+-blend_alpha <hex> ..... blend colors against background color
+                         expressed as RGB values written in
+                         hexadecimal, e.g. 0xc0e0d0 for red=0xc0
+                         green=0xe0 and blue=0xd0
+-noalpha ............... discard any transparency information
+-lossless .............. encode image losslessly, default=off
+-near_lossless <int> ... use near-lossless image
+                         preprocessing (0..100=off), default=100
+-hint <string> ......... specify image characteristics hint,
+                         one of: photo, picture or graph
+
+-metadata <string> ..... comma separated list of metadata to
+                         copy from the input to the output if present.
+                         Valid values: all, none (default), exif, icc, xmp
+
+-short ................. condense printed message
+-quiet ................. don't print anything
+-version ............... print version number and exit
+-noasm ................. disable all assembly optimizations
+-v ..................... verbose, e.g. print encoding/decoding times
+-progress .............. report encoding progress
+```
+
+Experimental Options:
+
+```
+-jpeg_like ............. roughly match expected JPEG size
+-af .................... auto-adjust filter strength
+-pre <int> ............. pre-processing filter
+```
+
+The main options you might want to try in order to further tune the visual
+quality are:
+
+-preset -sns -f -m
+
+Namely:
+
+*   `preset` will set up a default encoding configuration targeting a particular
+    type of input. It should appear first in the list of options, so that
+    subsequent options can take effect on top of this preset. Default value is
+    'default'.
+*   `sns` will progressively turn on (when going from 0 to 100) some additional
+    visual optimizations (like: segmentation map re-enforcement). This option
+    will balance the bit allocation differently. It tries to take bits from the
+    "easy" parts of the picture and use them in the "difficult" ones instead.
+    Usually, raising the sns value (at fixed -q value) leads to larger files,
+    but with better quality. Typical value is around '75'.
+*   `f` option directly links to the filtering strength used by the codec's
+    in-loop processing. The higher the value, the smoother the highly-compressed
+    area will look. This is particularly useful when aiming at very small files.
+    Typical values are around 20-30. Note that using the option
+    -strong/-nostrong will change the type of filtering. Use "-f 0" to turn
+    filtering off.
+*   `m` controls the trade-off between encoding speed and quality. Default is 4.
+    You can try -m 5 or -m 6 to explore more (time-consuming) encoding
+    possibilities. A lower value will result in faster encoding at the expense
+    of quality.
+
+## Decoding tool
+
+There is a decoding sample in examples/dwebp.c which will take a .webp file and
+decode it to a PNG image file (amongst other formats). This is simply to
+demonstrate the use of the API. You can verify the file test.webp decodes to
+exactly the same as test_ref.ppm by using:
+
+```shell
+cd examples
+./dwebp test.webp -ppm -o test.ppm
+diff test.ppm test_ref.ppm
+```
+
+The full list of options is available using -h:
+
+```shell
+> dwebp -h
+Usage: dwebp in_file [options] [-o out_file]
+```
+
+Decodes the WebP image file to PNG format [Default]. Note: Animated WebP files
+are not supported.
+
+Use following options to convert into alternate image formats:
+
+```
+-pam ......... save the raw RGBA samples as a color PAM
+-ppm ......... save the raw RGB samples as a color PPM
+-bmp ......... save as uncompressed BMP format
+-tiff ........ save as uncompressed TIFF format
+-pgm ......... save the raw YUV samples as a grayscale PGM
+               file with IMC4 layout
+-yuv ......... save the raw YUV samples in flat layout
+```
+
+Other options are:
+
+```
+-version ..... print version number and exit
+-nofancy ..... don't use the fancy YUV420 upscaler
+-nofilter .... disable in-loop filtering
+-nodither .... disable dithering
+-dither <d> .. dithering strength (in 0..100)
+-alpha_dither  use alpha-plane dithering if needed
+-mt .......... use multi-threading
+-crop <x> <y> <w> <h> ... crop output with the given rectangle
+-resize <w> <h> ......... resize output (*after* any cropping)
+-flip ........ flip the output vertically
+-alpha ....... only save the alpha plane
+-incremental . use incremental decoding (useful for tests)
+-h ........... this help message
+-v ........... verbose (e.g. print encoding/decoding times)
+-quiet ....... quiet mode, don't print anything
+-noasm ....... disable all assembly optimizations
+```
+
+## WebP file analysis tool
+
+`webpinfo` can be used to print out the chunk level structure and bitstream
+header information of WebP files. It can also check if the files are of valid
+WebP format.
+
+Usage:
+
+```shell
+webpinfo [options] in_files
+```
+
+Note: there could be multiple input files; options must come before input files.
+
+Options:
+
+```
+-version ........... Print version number and exit.
+-quiet ............. Do not show chunk parsing information.
+-diag .............. Show parsing error diagnosis.
+-summary ........... Show chunk stats summary.
+-bitstream_info .... Parse bitstream header.
+```
+
+## Visualization tool
+
+There's a little self-serve visualization tool called 'vwebp' under the
+examples/ directory. It uses OpenGL to open a simple drawing window and show a
+decoded WebP file. It's not yet integrated in the automake build system, but you
+can try to manually compile it using the recommendations below.
+
+Usage:
+
+```shell
+vwebp in_file [options]
+```
+
+Decodes the WebP image file and visualize it using OpenGL
+
+Options are:
+
+```
+-version ..... print version number and exit
+-noicc ....... don't use the icc profile if present
+-nofancy ..... don't use the fancy YUV420 upscaler
+-nofilter .... disable in-loop filtering
+-dither <int>  dithering strength (0..100), default=50
+-noalphadither disable alpha plane dithering
+-usebgcolor .. display background color
+-mt .......... use multi-threading
+-info ........ print info
+-h ........... this help message
+```
+
+Keyboard shortcuts:
+
+```
+'c' ................ toggle use of color profile
+'b' ................ toggle background color display
+'i' ................ overlay file information
+'d' ................ disable blending & disposal (debug)
+'q' / 'Q' / ESC .... quit
+```
+
+### Building
+
+Prerequisites:
+
+1.  OpenGL & OpenGL Utility Toolkit (GLUT)
+
+    Linux: `sudo apt-get install freeglut3-dev mesa-common-dev`
+
+    Mac + Xcode: These libraries should be available in the OpenGL / GLUT
+    frameworks.
+
+    Windows: http://freeglut.sourceforge.net/index.php#download
+
+2.  (Optional) qcms (Quick Color Management System)
+
+    1.  Download qcms from Mozilla / Chromium:
+        https://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
+        https://source.chromium.org/chromium/chromium/src/+/main:third_party/qcms/;drc=d4a2f8e1ed461d8fc05ed88d1ae2dc94c9773825
+    2.  Build and archive the source files as libqcms.a / qcms.lib
+    3.  Update makefile.unix / Makefile.vc
+        1.  Define WEBP_HAVE_QCMS
+        2.  Update include / library paths to reference the qcms directory.
+
+Build using makefile.unix / Makefile.vc:
+
+```shell
+$ make -f makefile.unix examples/vwebp
+> nmake /f Makefile.vc CFG=release-static \
+    ../obj/x64/release-static/bin/vwebp.exe
+```
+
+## Animation creation tool
+
+The utility `img2webp` can turn a sequence of input images (PNG, JPEG, ...) into
+an animated WebP file. It offers fine control over duration, encoding modes,
+etc.
+
+Usage:
+
+```shell
+img2webp [file_options] [[frame_options] frame_file]...
+```
+
+File-level options (only used at the start of compression):
+
+```
+-min_size ............ minimize size
+-loop <int> .......... loop count (default: 0, = infinite loop)
+-kmax <int> .......... maximum number of frame between key-frames
+                        (0=only keyframes)
+-kmin <int> .......... minimum number of frame between key-frames
+                        (0=disable key-frames altogether)
+-mixed ............... use mixed lossy/lossless automatic mode
+-v ................... verbose mode
+-h ................... this help
+-version ............. print version number and exit
+```
+
+Per-frame options (only used for subsequent images input):
+
+```
+-d <int> ............. frame duration in ms (default: 100)
+-lossless  ........... use lossless mode (default)
+-lossy ... ........... use lossy mode
+-q <float> ........... quality
+-m <int> ............. method to use
+```
+
+example: `img2webp -loop 2 in0.png -lossy in1.jpg -d 80 in2.tiff -o out.webp`
+
+Note: if a single file name is passed as the argument, the arguments will be
+tokenized from this file. The file name must not start with the character '-'.
+
+## Animated GIF conversion
+
+Animated GIF files can be converted to WebP files with animation using the
+gif2webp utility available under examples/. The files can then be viewed using
+vwebp.
+
+Usage:
+
+```shell
+gif2webp [options] gif_file -o webp_file
+```
+
+Options:
+
+```
+-h / -help ............. this help
+-lossy ................. encode image using lossy compression
+-mixed ................. for each frame in the image, pick lossy
+                         or lossless compression heuristically
+-q <float> ............. quality factor (0:small..100:big)
+-m <int> ............... compression method (0=fast, 6=slowest)
+-min_size .............. minimize output size (default:off)
+                         lossless compression by default; can be
+                         combined with -q, -m, -lossy or -mixed
+                         options
+-kmin <int> ............ min distance between key frames
+-kmax <int> ............ max distance between key frames
+-f <int> ............... filter strength (0=off..100)
+-metadata <string> ..... comma separated list of metadata to
+                         copy from the input to the output if present
+                         Valid values: all, none, icc, xmp (default)
+-loop_compatibility .... use compatibility mode for Chrome
+                         version prior to M62 (inclusive)
+-mt .................... use multi-threading if available
+
+-version ............... print version number and exit
+-v ..................... verbose
+-quiet ................. don't print anything
+```
+
+### Building
+
+With the libgif development files installed, gif2webp can be built using
+makefile.unix:
+
+```shell
+$ make -f makefile.unix examples/gif2webp
+```
+
+or using autoconf:
+
+```shell
+$ ./configure --enable-everything
+$ make
+```
+
+## Comparison of animated images
+
+Test utility anim_diff under examples/ can be used to compare two animated
+images (each can be GIF or WebP).
+
+Usage:
+
+```shell
+anim_diff <image1> <image2> [options]
+```
+
+Options:
+
+```
+-dump_frames <folder> dump decoded frames in PAM format
+-min_psnr <float> ... minimum per-frame PSNR
+-raw_comparison ..... if this flag is not used, RGB is
+                      premultiplied before comparison
+-max_diff <int> ..... maximum allowed difference per channel
+                      between corresponding pixels in subsequent
+                      frames
+-h .................. this help
+-version ............ print version number and exit
+```
+
+### Building
+
+With the libgif development files installed, anim_diff can be built using
+makefile.unix:
+
+```shell
+$ make -f makefile.unix examples/anim_diff
+```
+
+or using autoconf:
+
+```shell
+$ ./configure --enable-everything
+$ make
+```
+
+## WebP Mux tool
+
+The examples/ directory contains a tool (webpmux) for manipulating WebP files.
+The webpmux tool can be used to create an extended format WebP file and also to
+extract or strip relevant data from such a file.
+
+A list of options is available using the -help command line flag:
+
+```shell
+> webpmux -help
+Usage: webpmux -get GET_OPTIONS INPUT -o OUTPUT
+       webpmux -set SET_OPTIONS INPUT -o OUTPUT
+       webpmux -duration DURATION_OPTIONS [-duration ...]
+               INPUT -o OUTPUT
+       webpmux -strip STRIP_OPTIONS INPUT -o OUTPUT
+       webpmux -frame FRAME_OPTIONS [-frame...] [-loop LOOP_COUNT]
+               [-bgcolor BACKGROUND_COLOR] -o OUTPUT
+       webpmux -info INPUT
+       webpmux [-h|-help]
+       webpmux -version
+       webpmux argument_file_name
+
+GET_OPTIONS:
+ Extract relevant data:
+   icc       get ICC profile
+   exif      get EXIF metadata
+   xmp       get XMP metadata
+   frame n   get nth frame
+
+SET_OPTIONS:
+ Set color profile/metadata/parameters:
+   loop LOOP_COUNT            set the loop count
+   bgcolor BACKGROUND_COLOR   set the animation background color
+   icc  file.icc              set ICC profile
+   exif file.exif             set EXIF metadata
+   xmp  file.xmp              set XMP metadata
+   where:    'file.icc' contains the ICC profile to be set,
+             'file.exif' contains the EXIF metadata to be set
+             'file.xmp' contains the XMP metadata to be set
+
+DURATION_OPTIONS:
+ Set duration of selected frames:
+   duration            set duration for all frames
+   duration,frame      set duration of a particular frame
+   duration,start,end  set duration of frames in the
+                        interval [start,end])
+   where: 'duration' is the duration in milliseconds
+          'start' is the start frame index
+          'end' is the inclusive end frame index
+           The special 'end' value '0' means: last frame.
+
+STRIP_OPTIONS:
+ Strip color profile/metadata:
+   icc       strip ICC profile
+   exif      strip EXIF metadata
+   xmp       strip XMP metadata
+
+FRAME_OPTIONS(i):
+ Create animation:
+   file_i +di[+xi+yi[+mi[bi]]]
+   where:    'file_i' is the i'th animation frame (WebP format),
+             'di' is the pause duration before next frame,
+             'xi','yi' specify the image offset for this frame,
+             'mi' is the dispose method for this frame (0 or 1),
+             'bi' is the blending method for this frame (+b or -b)
+
+LOOP_COUNT:
+ Number of times to repeat the animation.
+ Valid range is 0 to 65535 [Default: 0 (infinite)].
+
+BACKGROUND_COLOR:
+ Background color of the canvas.
+  A,R,G,B
+  where:    'A', 'R', 'G' and 'B' are integers in the range 0 to 255 specifying
+            the Alpha, Red, Green and Blue component values respectively
+            [Default: 255,255,255,255]
+
+INPUT & OUTPUT are in WebP format.
+
+Note: The nature of EXIF, XMP and ICC data is not checked and is assumed to be
+valid.
+
+Note: if a single file name is passed as the argument, the arguments will be
+tokenized from this file. The file name must not start with the character '-'.
+```
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -4,8 +4,8 @@ Although you may be viewing an alternate representation, this document
 is sourced in Markdown, a light-duty markup scheme, and is optimized for
 the [kramdown](https://kramdown.gettalong.org/) transformer.

-See the accompanying README. External link targets are referenced at the
-end of this file.
+See the accompanying specs_generation.md. External link targets are referenced
+at the end of this file.

 -->

@ -36,7 +36,7 @@ for:
  * **Lossless compression.** An image can be losslessly compressed, using the
    WebP Lossless Format.

-  * **Metadata.** An image may have metadata stored in EXIF or XMP formats.
+  * **Metadata.** An image may have metadata stored in Exif or XMP formats.

  * **Transparency.** An image may have transparency, i.e., an alpha channel.

@ -94,7 +94,7 @@ _1-based_
 RIFF File Format
 ----------------

-The WebP file format is based on the RIFF (resource interchange file format)
+The WebP file format is based on the RIFF (Resource Interchange File Format)
 document format.

 The basic element of a RIFF file is a _chunk_. It consists of:
@ -261,7 +261,7 @@ An extended format file consists of:

  * Image data.

-  * An optional 'EXIF' chunk with EXIF metadata.
+  * An optional 'EXIF' chunk with Exif metadata.

  * An optional 'XMP ' chunk with XMP metadata.

@ -317,9 +317,9 @@ Alpha (L): 1 bit
 : Set if any of the frames of the image contain transparency information
  ("alpha").

-EXIF metadata (E): 1 bit
+Exif metadata (E): 1 bit

-: Set if the file contains EXIF metadata.
+: Set if the file contains Exif metadata.

 XMP metadata (X): 1 bit

@ -341,12 +341,12 @@ Reserved: 24 bits
 Canvas Width Minus One: 24 bits

 : _1-based_ width of the canvas in pixels.
-  The actual canvas width is '1 + Canvas Width Minus One'
+  The actual canvas width is `1 + Canvas Width Minus One`.

 Canvas Height Minus One: 24 bits

 : _1-based_ height of the canvas in pixels.
-  The actual canvas height is '1 + Canvas Height Minus One'
+  The actual canvas height is `1 + Canvas Height Minus One`.

 The product of _Canvas Width_ and _Canvas Height_ MUST be at most `2^32 - 1`.

@ -423,21 +423,21 @@ If the _Animation flag_ is not set, then this chunk SHOULD NOT be present.

 Frame X: 24 bits (_uint24_)

-: The X coordinate of the upper left corner of the frame is `Frame X * 2`
+: The X coordinate of the upper left corner of the frame is `Frame X * 2`.

 Frame Y: 24 bits (_uint24_)

-: The Y coordinate of the upper left corner of the frame is `Frame Y * 2`
+: The Y coordinate of the upper left corner of the frame is `Frame Y * 2`.

 Frame Width Minus One: 24 bits (_uint24_)

 : The _1-based_ width of the frame.
-  The frame width is `1 + Frame Width Minus One`
+  The frame width is `1 + Frame Width Minus One`.

 Frame Height Minus One: 24 bits (_uint24_)

 : The _1-based_ height of the frame.
-  The frame height is `1 + Frame Height Minus One`
+  The frame height is `1 + Frame Height Minus One`.

 Frame Duration: 24 bits (_uint24_)

@ -677,12 +677,12 @@ EXIF chunk:
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |                      ChunkHeader('EXIF')                      |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |                        EXIF Metadata                          |
+    |                        Exif Metadata                          |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

-EXIF Metadata: _Chunk Size_ bytes
+Exif Metadata: _Chunk Size_ bytes

-: image metadata in EXIF format.
+: image metadata in Exif format.

 XMP chunk:

@ -798,7 +798,7 @@ RIFF/WEBP
 +- XMP  (metadata)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-An animated image with EXIF metadata may look as follows:
+An animated image with Exif metadata may look as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 RIFF/WEBP
--- a/doc/webp-lossless-bitstream-spec.txt
+++ b/doc/webp-lossless-bitstream-spec.txt
@ -4,8 +4,8 @@ Although you may be viewing an alternate representation, this document
 is sourced in Markdown, a light-duty markup scheme, and is optimized for
 the [kramdown](https://kramdown.gettalong.org/) transformer.

-See the accompanying README. External link targets are referenced at the
-end of this file.
+See the accompanying specs_generation.md. External link targets are referenced
+at the end of this file.

 -->

@ -16,6 +16,8 @@ _Jyrki Alakuijala, Ph.D., Google, Inc., 2012-06-19_

 Paragraphs marked as \[AMENDED\] were amended on 2014-09-16.

+Paragraphs marked as \[AMENDED2\] were amended on 2022-05-13.
+
 Abstract
 --------

@ -25,7 +27,7 @@ exactly, including the color values for zero alpha pixels. The
 format uses subresolution images, recursively embedded into the format
 itself, for storing statistical data about the images, such as the used
 entropy codes, spatial predictors, color space conversion, and color
-table. LZ77, Huffman coding, and a color cache are used for compression
+table. LZ77, prefix coding, and a color cache are used for compression
 of the bulk data. Decoding speeds faster than PNG have been
 demonstrated, as well as 25% denser compression than can be achieved
 using today's PNG format.
@ -63,9 +65,9 @@ distance mapping
 entropy image
 : A two-dimensional subresolution image indicating which entropy coding
  should be used in a respective square in the image, i.e., each pixel
-  is a meta Huffman code.
+  is a meta prefix code.

-Huffman code
+prefix code
 : A classic way to do entropy coding where a smaller number of bits are
  used for more frequent codes.

@ -73,9 +75,9 @@ LZ77
 : Dictionary-based sliding window compression algorithm that either
  emits symbols or describes them as sequences of past symbols.

-meta Huffman code
+meta prefix code
 : A small integer (up to 16 bits) that indexes an element in the meta
-  Huffman table.
+  prefix table.

 predictor image
 : A two-dimensional subresolution image indicating which spatial
@ -235,7 +237,7 @@ transform, the current pixel value is predicted from the pixels already
 decoded (in scan-line order) and only the residual value (actual -
 predicted) is encoded. The _prediction mode_ determines the type of
 prediction to use. We divide the image into squares and all the pixels
-in a square use same prediction mode.
+in a square use the same prediction mode.

 The first 3 bits of prediction data define the block width and height in
 number of bits. The number of block columns, `block_xsize`, is used in
@ -367,15 +369,17 @@ the predicted value for the left-topmost pixel of the image is
 0xff000000, L-pixel for all pixels on the top row, and T-pixel for all
 pixels on the leftmost column.

+\[AMENDED2\]
 Addressing the TR-pixel for pixels on the rightmost column is
 exceptional. The pixels on the rightmost column are predicted by using
-the modes \[0..13\] just like pixels not on border, but by using the
-leftmost pixel on the same row as the current TR-pixel. The TR-pixel
-offset in memory is the same for border and non-border pixels.
+the modes \[0..13\] just like pixels not on the border, but the leftmost pixel
+on the same row as the current pixel is instead used as the TR-pixel.


 ### Color Transform

+\[AMENDED2\]
+
 The goal of the color transform is to decorrelate the R, G and B values
 of each pixel. Color transform keeps the green (G) value as it is,
 transforms red (R) based on green and transforms blue (B) based on green
@ -396,8 +400,8 @@ typedef struct {
 The actual color transformation is done by defining a color transform
 delta. The color transform delta depends on the `ColorTransformElement`,
 which is the same for all the pixels in a particular block. The delta is
-added during color transform. The inverse color transform then is just
-subtracting those deltas.
+subtracted during color transform. The inverse color transform then is just
+adding those deltas.

 The color transform function is defined as follows:

@ -406,13 +410,13 @@ void ColorTransform(uint8 red, uint8 blue, uint8 green,
                    ColorTransformElement *trans,
                    uint8 *new_red, uint8 *new_blue) {
  // Transformed values of red and blue components
-  uint32 tmp_red = red;
-  uint32 tmp_blue = blue;
+  int tmp_red = red;
+  int tmp_blue = blue;

-  // Applying transform is just adding the transform deltas
-  tmp_red  += ColorTransformDelta(trans->green_to_red, green);
-  tmp_blue += ColorTransformDelta(trans->green_to_blue, green);
-  tmp_blue += ColorTransformDelta(trans->red_to_blue, red);
+  // Applying the transform is just subtracting the transform deltas
+  tmp_red  -= ColorTransformDelta(p->green_to_red_,  green);
+  tmp_blue -= ColorTransformDelta(p->green_to_blue_, green);
+  tmp_blue -= ColorTransformDelta(p->red_to_blue_, red);

  *new_red = tmp_red & 0xff;
  *new_blue = tmp_blue & 0xff;
@ -430,7 +434,7 @@ int8 ColorTransformDelta(int8 t, int8 c) {
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 A conversion from the 8-bit unsigned representation (uint8) to the 8-bit
-signed one (int8) is required before calling ColorTransformDelta().
+signed one (int8) is required before calling `ColorTransformDelta()`.
 It should be performed using 8-bit two's complement (that is: uint8 range
 \[128-255\] is mapped to the \[-128, -1\] range of its converted int8 value).

@ -468,14 +472,18 @@ channels.
 void InverseTransform(uint8 red, uint8 green, uint8 blue,
                      ColorTransformElement *p,
                      uint8 *new_red, uint8 *new_blue) {
-  // Applying inverse transform is just subtracting the
-  // color transform deltas
-  red  -= ColorTransformDelta(p->green_to_red_,  green);
-  blue -= ColorTransformDelta(p->green_to_blue_, green);
-  blue -= ColorTransformDelta(p->red_to_blue_, red & 0xff);
+  // Transformed values of red and blue components
+  int tmp_red = red;
+  int tmp_blue = blue;

-  *new_red = red & 0xff;
-  *new_blue = blue & 0xff;
+  // Applying inverse transform is just adding the
+  // color transform deltas
+  tmp_red  += ColorTransformDelta(trans->green_to_red, green);
+  tmp_blue += ColorTransformDelta(trans->green_to_blue, green);
+  tmp_blue += ColorTransformDelta(trans->red_to_blue, tmp_red & 0xff);
+
+  *new_red = tmp_red & 0xff;
+  *new_blue = tmp_blue & 0xff;
 }
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -590,12 +598,12 @@ The values are packed into the green component as follows:
    4 most-significant bits of the green value at x / 2.
  * `width_bits` = 2: for every x value where x ≡ 0 (mod 4), a green
    value at x is positioned into the 2 least-significant bits of the
-    green value at x / 4, green values at x + 1 to x + 3 in order to the
-    more significant bits of the green value at x / 4.
+    green value at x / 4, green values at x + 1 to x + 3 are positioned in order
+    to the more significant bits of the green value at x / 4.
  * `width_bits` = 3: for every x value where x ≡ 0 (mod 8), a green
    value at x is positioned into the least-significant bit of the green
-    value at x / 8, green values at x + 1 to x + 7 in order to the more
-    significant bits of the green value at x / 8.
+    value at x / 8, green values at x + 1 to x + 7 are positioned in order to
+    the more significant bits of the green value at x / 8.


 4 Image Data
@ -609,8 +617,8 @@ We use image data in five different roles:

  1. ARGB image: Stores the actual pixels of the image.
  1. Entropy image: Stores the
-     [meta Huffman codes](#decoding-of-meta-huffman-codes). The red and green
-     components of a pixel define the meta Huffman code used in a particular
+     [meta prefix codes](#decoding-of-meta-prefix-codes). The red and green
+     components of a pixel define the meta prefix code used in a particular
     block of the ARGB image.
  1. Predictor image: Stores the metadata for [Predictor
     Transform](#predictor-transform). The green component of a pixel defines
@ -621,12 +629,12 @@ We use image data in five different roles:
     the image. Each `ColorTransformElement` `'cte'` is treated as a pixel whose
     alpha component is `255`, red component is `cte.red_to_blue`, green
     component is `cte.green_to_blue` and blue component is `cte.green_to_red`.
-  1. Color indexing image: An array of of size `color_table_size` (up to 256
+  1. Color indexing image: An array of size `color_table_size` (up to 256
     ARGB values) storing the metadata for the
     [Color Indexing Transform](#color-indexing-transform). This is stored as an
     image of width `color_table_size` and height `1`.

-### 4.2 Encoding of Image data
+### 4.2 Encoding of Image Data

 The encoding of image data is independent of its role.

@ -643,7 +651,7 @@ the image.

 Each pixel is encoded using one of the three possible methods:

-  1. Huffman coded literal: each channel (green, red, blue and alpha) is
+  1. prefix coded literal: each channel (green, red, blue and alpha) is
     entropy-coded independently;
  2. LZ77 backward reference: a sequence of pixels are copied from elsewhere
     in the image; or
@ -652,9 +660,9 @@ Each pixel is encoded using one of the three possible methods:

 The following sub-sections describe each of these in detail.

-#### 4.2.1 Huffman Coded Literals
+#### 4.2.1 Prefix Coded Literals

-The pixel is stored as Huffman coded values of green, red, blue and alpha (in
+The pixel is stored as prefix coded values of green, red, blue and alpha (in
 that order). See [this section](#decoding-entropy-coded-image-data) for details.

 #### 4.2.2 LZ77 Backward Reference
@ -678,7 +686,7 @@ very few values in the image. Thus, this approach results in a better
 compression overall.

 The following table denotes the prefix codes and extra bits used for storing
-different range of values.
+different ranges of values.

 Note: The maximum backward reference length is limited to 4096. Hence, only the
 first 24 prefix codes (with the respective extra bits) are meaningful for length
@ -753,13 +761,13 @@ The mapping between distance code `i` and the neighboring pixel offset
 (-6, 7), (7, 6),  (-7, 6), (8, 5),  (7, 7),  (-7, 7), (8, 6),  (8, 7)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-For example, distance code `1` indicates offset of `(0, 1)` for the neighboring
-pixel, that is, the pixel above the current pixel (0-pixel difference in
-X-direction and 1 pixel difference in Y-direction). Similarly, distance code
-`3` indicates left-top pixel.
+For example, distance code `1` indicates an offset of `(0, 1)` for the
+neighboring pixel, that is, the pixel above the current pixel (0 pixel
+difference in X-direction and 1 pixel difference in Y-direction). Similarly,
+distance code `3` indicates left-top pixel.

-The decoder can convert a distances code 'i' to a scan-line order distance
-'dist' as follows:
+The decoder can convert a distance code `i` to a scan-line order distance
+`dist` as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 (xi, yi) = distance_map[i]
@ -769,21 +777,22 @@ if (dist < 1) {
 }
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-where 'distance_map' is the mapping noted above and `xsize` is the width of the
+where `distance_map` is the mapping noted above and `xsize` is the width of the
 image in pixels.


 #### 4.2.3 Color Cache Coding
+{:#color-cache-code}

 Color cache stores a set of colors that have been recently used in the image.

 **Rationale:** This way, the recently used colors can sometimes be referred to
-more efficiently than emitting them using other two methods (described in
-[4.2.1](#huffman-coded-literals) and [4.2.2](#lz77-backward-reference)).
+more efficiently than emitting them using the other two methods (described in
+[4.2.1](#prefix-coded-literals) and [4.2.2](#lz77-backward-reference)).

 Color cache codes are stored as follows. First, there is a 1-bit value that
 indicates if the color cache is used. If this bit is 0, no color cache codes
-exist, and they are not transmitted in the Huffman code that decodes the green
+exist, and they are not transmitted in the prefix code that decodes the green
 symbols and the length prefix codes. However, if this bit is 1, the color cache
 size is read next:

@ -814,130 +823,245 @@ literals, into the cache in the order they appear in the stream.

 ### 5.1 Overview

-Most of the data is coded using [canonical Huffman code][canonical_huff]. Hence,
-the codes are transmitted by sending the _Huffman code lengths_, as opposed to
-the actual _Huffman codes_.
+Most of the data is coded using a [canonical prefix code][canonical_huff].
+Hence, the codes are transmitted by sending the _prefix code lengths_, as
+opposed to the actual _prefix codes_.

-In particular, the format uses **spatially-variant Huffman coding**. In other
+In particular, the format uses **spatially-variant prefix coding**. In other
 words, different blocks of the image can potentially use different entropy
 codes.

-**Rationale**: Different areas of the image may have different characteristics. So, allowing them to use different entropy codes provides more flexibility and
-potentially a better compression.
+**Rationale**: Different areas of the image may have different characteristics.
+So, allowing them to use different entropy codes provides more flexibility and
+potentially better compression.

 ### 5.2 Details

-The encoded image data consists of two parts:
+The encoded image data consists of several parts:

-  1. Meta Huffman codes
+  1. Decoding and building the prefix codes \[AMENDED2\]
+  1. Meta prefix codes
  1. Entropy-coded image data

-#### 5.2.1 Decoding of Meta Huffman Codes
+#### 5.2.1 Decoding and Building the Prefix Codes

-As noted earlier, the format allows the use of different Huffman codes for
-different blocks of the image. _Meta Huffman codes_ are indexes identifying
-which Huffman codes to use in different parts of the image.
+There are several steps in decoding the prefix codes.

-Meta Huffman codes may be used _only_ when the image is being used in the
+**Decoding the Code Lengths:**
+{:#decoding-the-code-lengths}
+
+This section describes how to read the prefix code lengths from the bitstream.
+
+The prefix code lengths can be coded in two ways. The method used is specified
+by a 1-bit value.
+
+  * If this bit is 1, it is a _simple code length code_, and
+  * If this bit is 0, it is a _normal code length code_.
+
+In both cases, there can be unused code lengths that are still part of the
+stream. This may be inefficient, but it is allowed by the format.
+
+**(i) Simple Code Length Code:**
+
+\[AMENDED2\]
+
+This variant is used in the special case when only 1 or 2 prefix symbols are
+in the range \[0..255\] with code length `1`. All other prefix code lengths
+are implicitly zeros.
+
+The first bit indicates the number of symbols:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int num_symbols = ReadBits(1) + 1;
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Following are the symbol values.
+This first symbol is coded using 1 or 8 bits depending on the value of
+`is_first_8bits`. The range is \[0..1\] or \[0..255\], respectively.
+The second symbol, if present, is always assumed to be in the range \[0..255\]
+and coded using 8 bits.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int is_first_8bits = ReadBits(1);
+symbol0 = ReadBits(1 + 7 * is_first_8bits);
+code_lengths[symbol0] = 1;
+if (num_symbols == 2) {
+  symbol1 = ReadBits(8);
+  code_lengths[symbol1] = 1;
+}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Note:** Another special case is when _all_ prefix code lengths are _zeros_
+(an empty prefix code). For example, a prefix code for distance can be empty
+if there are no backward references. Similarly, prefix codes for alpha, red,
+and blue can be empty if all pixels within the same meta prefix code are
+produced using the color cache. However, this case doesn't need a special
+handling, as empty prefix codes can be coded as those containing a single
+symbol `0`.
+
+**(ii) Normal Code Length Code:**
+
+The code lengths of the prefix code fit in 8 bits and are read as follows.
+First, `num_code_lengths` specifies the number of code lengths.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int num_code_lengths = 4 + ReadBits(4);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If `num_code_lengths` is > 18, the bitstream is invalid.
+
+The code lengths are themselves encoded using prefix codes: lower level code
+lengths `code_length_code_lengths` first have to be read. The rest of those
+`code_length_code_lengths` (according to the order in `kCodeLengthCodeOrder`)
+are zeros.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int kCodeLengthCodes = 19;
+int kCodeLengthCodeOrder[kCodeLengthCodes] = {
+  17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+int code_length_code_lengths[kCodeLengthCodes] = { 0 };  // All zeros.
+for (i = 0; i < num_code_lengths; ++i) {
+  code_length_code_lengths[kCodeLengthCodeOrder[i]] = ReadBits(3);
+}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Next, if `ReadBits(1) == 0`, the maximum number of different read symbols is
+`num_code_lengths`. Otherwise, it is defined as:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int length_nbits = 2 + 2 * ReadBits(3);
+int max_symbol = 2 + ReadBits(length_nbits);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A prefix table is then built from `code_length_code_lengths` and used to read
+up to `max_symbol` code lengths.
+
+  * Code \[0..15\] indicates literal code lengths.
+    * Value 0 means no symbols have been coded.
+    * Values \[1..15\] indicate the bit length of the respective code.
+  * Code 16 repeats the previous non-zero value \[3..6\] times, i.e.,
+    `3 + ReadBits(2)` times. If code 16 is used before a non-zero
+    value has been emitted, a value of 8 is repeated.
+  * Code 17 emits a streak of zeros \[3..10\], i.e., `3 + ReadBits(3)`
+    times.
+  * Code 18 emits a streak of zeros of length \[11..138\], i.e.,
+    `11 + ReadBits(7)` times.
+
+Once code lengths are read, a prefix code for each symbol type (A, R, G, B,
+distance) is formed using their respective alphabet sizes:
+
+  * G channel: 256 + 24 + `color_cache_size`
+  * other literals (A,R,B): 256
+  * distance code: 40
+
+#### 5.2.2 Decoding of Meta Prefix Codes
+
+As noted earlier, the format allows the use of different prefix codes for
+different blocks of the image. _Meta prefix codes_ are indexes identifying
+which prefix codes to use in different parts of the image.
+
+Meta prefix codes may be used _only_ when the image is being used in the
 [role](#roles-of-image-data) of an _ARGB image_.

-There are two possibilities for the meta Huffman codes, indicated by a 1-bit
+There are two possibilities for the meta prefix codes, indicated by a 1-bit
 value:

-  * If this bit is zero, there is only one meta Huffman code used everywhere in
+  * If this bit is zero, there is only one meta prefix code used everywhere in
    the image. No more data is stored.
-  * If this bit is one, the image uses multiple meta Huffman codes. These meta
-    Huffman codes are stored as an _entropy image_ (described below).
+  * If this bit is one, the image uses multiple meta prefix codes. These meta
+    prefix codes are stored as an _entropy image_ (described below).

 **Entropy image:**

-The entropy image defines which Huffman codes are used in different parts of the
+The entropy image defines which prefix codes are used in different parts of the
 image, as described below.

-The first 3-bits contain the `huffman_bits` value. The dimensions of the entropy
-image are derived from 'huffman_bits'.
+The first 3-bits contain the `prefix_bits` value. The dimensions of the entropy
+image are derived from 'prefix_bits'.

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int huffman_bits = ReadBits(3) + 2;
-int huffman_xsize = DIV_ROUND_UP(xsize, 1 << huffman_bits);
-int huffman_ysize = DIV_ROUND_UP(ysize, 1 << huffman_bits);
+int prefix_bits = ReadBits(3) + 2;
+int prefix_xsize = DIV_ROUND_UP(xsize, 1 << prefix_bits);
+int prefix_ysize = DIV_ROUND_UP(ysize, 1 << prefix_bits);
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 where `DIV_ROUND_UP` is as defined [earlier](#predictor-transform).

-Next bits contain an entropy image of width `huffman_xsize` and height
-`huffman_ysize`.
+The next bits contain an entropy image of width `prefix_xsize` and height
+`prefix_ysize`.

-**Interpretation of Meta Huffman Codes:**
+**Interpretation of Meta Prefix Codes:**

-For any given pixel (x, y), there is a set of five Huffman codes associated with
+For any given pixel (x, y), there is a set of five prefix codes associated with
 it. These codes are (in bitstream order):

-  * **Huffman code #1**: used for green channel, backward-reference length and
+  * **prefix code #1**: used for green channel, backward-reference length and
    color cache
-  * **Huffman code #2, #3 and #4**: used for red, blue and alpha channels
+  * **prefix code #2, #3 and #4**: used for red, blue and alpha channels
    respectively.
-  * **Huffman code #5**: used for backward-reference distance.
+  * **prefix code #5**: used for backward-reference distance.

-From here on, we refer to this set as a **Huffman code group**.
+From here on, we refer to this set as a **prefix code group**.

-The number of Huffman code groups in the ARGB image can be obtained by finding
-the _largest meta Huffman code_ from the entropy image:
+The number of prefix code groups in the ARGB image can be obtained by finding
+the _largest meta prefix code_ from the entropy image:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int num_huff_groups = max(entropy image) + 1;
+int num_prefix_groups = max(entropy image) + 1;
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-where `max(entropy image)` indicates the largest Huffman code stored in the
+where `max(entropy image)` indicates the largest prefix code stored in the
 entropy image.

-As each Huffman code groups contains five Huffman codes, the total number of
-Huffman codes is:
+As each prefix code group contains five prefix codes, the total number of
+prefix codes is:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int num_huff_codes = 5 * num_huff_groups;
+int num_prefix_codes = 5 * num_prefix_groups;
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Given a pixel (x, y) in the ARGB image, we can obtain the corresponding Huffman
+Given a pixel (x, y) in the ARGB image, we can obtain the corresponding prefix
 codes to be used as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int position = (y >> huffman_bits) * huffman_xsize + (x >> huffman_bits);
-int meta_huff_code = (entropy_image[pos] >> 8) & 0xffff;
-HuffmanCodeGroup huff_group = huffman_code_groups[meta_huff_code];
+int position = (y >> prefix_bits) * prefix_xsize + (x >> prefix_bits);
+int meta_prefix_code = (entropy_image[pos] >> 8) & 0xffff;
+PrefixCodeGroup prefix_group = prefix_code_groups[meta_prefix_code];
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-where, we have assumed the existence of `HuffmanCodeGroup` structure, which
-represents a set of five Huffman codes. Also, `huffman_code_groups` is an array
-of `HuffmanCodeGroup` (of size `num_huff_groups`).
+where, we have assumed the existence of `PrefixCodeGroup` structure, which
+represents a set of five prefix codes. Also, `prefix_code_groups` is an array
+of `PrefixCodeGroup` (of size `num_prefix_groups`).

-The decoder then uses Huffman code group `huff_group` to decode the pixel
+The decoder then uses prefix code group `prefix_group` to decode the pixel
 (x, y) as explained in the [next section](#decoding-entropy-coded-image-data).

-#### 5.2.2 Decoding Entropy-coded Image Data
+#### 5.2.3 Decoding Entropy-coded Image Data
+
+\[AMENDED2\]

 For the current position (x, y) in the image, the decoder first identifies the
-corresponding Huffman code group (as explained in the last section). Given the
-Huffman code group, the pixel is read and decoded as follows:
+corresponding prefix code group (as explained in the last section). Given the
+prefix code group, the pixel is read and decoded as follows:

-Read next symbol S from the bitstream using Huffman code #1. \[See
-[next section](#decoding-the-code-lengths) for details on decoding the Huffman
-code lengths\]. Note that S is any integer in the range `0` to
+Read next symbol S from the bitstream using prefix code #1. Note that S is any
+integer in the range `0` to
 `(256 + 24 + ` [`color_cache_size`](#color-cache-code)` - 1)`.

 The interpretation of S depends on its value:

  1. if S < 256
-     1. Use S as the green component
-     1. Read red from the bitstream using Huffman code #2
-     1. Read blue from the bitstream using Huffman code #3
-     1. Read alpha from the bitstream using Huffman code #4
-  1. if S < 256 + 24
-     1. Use S - 256 as a length prefix code
-     1. Read extra bits for length from the bitstream
+     1. Use S as the green component.
+     1. Read red from the bitstream using prefix code #2.
+     1. Read blue from the bitstream using prefix code #3.
+     1. Read alpha from the bitstream using prefix code #4.
+  1. if S >= 256 && S < 256 + 24
+     1. Use S - 256 as a length prefix code.
+     1. Read extra bits for length from the bitstream.
     1. Determine backward-reference length L from length prefix code and the
        extra bits read.
-     1. Read distance prefix code from the bitstream using Huffman code #5
-     1. Read extra bits for distance from the bitstream
+     1. Read distance prefix code from the bitstream using prefix code #5.
+     1. Read extra bits for distance from the bitstream.
     1. Determine backward-reference distance D from distance prefix code and
        the extra bits read.
     1. Copy the L pixels (in scan-line order) from the sequence of pixels
@ -947,80 +1071,6 @@ The interpretation of S depends on its value:
     1. Get ARGB color from the color cache at that index.


-**Decoding the Code Lengths:**
-{:#decoding-the-code-lengths}
-
-This section describes the details about reading a symbol from the bitstream by
-decoding the Huffman code length.
-
-The Huffman code lengths can be coded in two ways. The method used is specified
-by a 1-bit value.
-
-  * If this bit is 1, it is a _simple code length code_, and
-  * If this bit is 0, it is a _normal code length code_.
-
-**(i) Simple Code Length Code:**
-
-This variant is used in the special case when only 1 or 2 Huffman code lengths
-are non-zero, and are in the range of \[0, 255\]. All other Huffman code lengths
-are implicitly zeros.
-
-The first bit indicates the number of non-zero code lengths:
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int num_code_lengths = ReadBits(1) + 1;
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The first code length is stored either using a 1-bit code for values of 0 and 1,
-or using an 8-bit code for values in range \[0, 255\]. The second code length,
-when present, is coded as an 8-bit code.
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int is_first_8bits = ReadBits(1);
-code_lengths[0] = ReadBits(1 + 7 * is_first_8bits);
-if (num_code_lengths == 2) {
-  code_lengths[1] = ReadBits(8);
-}
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-**Note:** Another special case is when _all_ Huffman code lengths are _zeros_
-(an empty Huffman code). For example, a Huffman code for distance can be empty
-if there are no backward references. Similarly, Huffman codes for alpha, red,
-and blue can be empty if all pixels within the same meta Huffman code are
-produced using the color cache. However, this case doesn't need a special
-handling, as empty Huffman codes can be coded as those containing a single
-symbol `0`.
-
-**(ii) Normal Code Length Code:**
-
-The code lengths of a Huffman code are read as follows: `num_code_lengths`
-specifies the number of code lengths; the rest of the code lengths
-(according to the order in `kCodeLengthCodeOrder`) are zeros.
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int kCodeLengthCodes = 19;
-int kCodeLengthCodeOrder[kCodeLengthCodes] = {
-  17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-};
-int code_lengths[kCodeLengthCodes] = { 0 };  // All zeros.
-int num_code_lengths = 4 + ReadBits(4);
-for (i = 0; i < num_code_lengths; ++i) {
-  code_lengths[kCodeLengthCodeOrder[i]] = ReadBits(3);
-}
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-  * Code length code \[0..15\] indicates literal code lengths.
-    * Value 0 means no symbols have been coded.
-    * Values \[1..15\] indicate the bit length of the respective code.
-  * Code 16 repeats the previous non-zero value \[3..6\] times, i.e.,
-    3 + `ReadBits(2)` times.  If code 16 is used before a non-zero
-    value has been emitted, a value of 8 is repeated.
-  * Code 17 emits a streak of zeros \[3..10\], i.e., 3 + `ReadBits(3)`
-    times.
-  * Code 18 emits a streak of zeros of length \[11..138\], i.e.,
-    11 + `ReadBits(7)` times.
-
-
 6 Overall Structure of the Format
 ---------------------------------

@ -1056,23 +1106,26 @@ of pixels (xsize * ysize).

 #### Structure of the Image Data

+\[AMENDED2\]
+
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-<spatially-coded image> ::= <meta huffman><entropy-coded image>
-<entropy-coded image> ::= <color cache info><huffman codes><lz77-coded image>
-<meta huffman> ::= 1-bit value 0 |
-                   (1-bit value 1; <entropy image>)
-<entropy image> ::= 3-bit subsample value; <entropy-coded image>
+<spatially-coded image> ::= <color cache info><meta prefix><data>
+<entropy-coded image> ::= <color cache info><data>
 <color cache info> ::= 1 bit value 0 |
                       (1-bit value 1; 4-bit value for color cache size)
-<huffman codes> ::= <huffman code group> | <huffman code group><huffman codes>
-<huffman code group> ::= <huffman code><huffman code><huffman code>
-                         <huffman code><huffman code>
-                         See "Interpretation of Meta Huffman codes" to
-                         understand what each of these five Huffman codes are
+<meta prefix> ::= 1-bit value 0 |
+                  (1-bit value 1; <entropy image>)
+<data> ::= <prefix codes><lz77-coded image>
+<entropy image> ::= 3-bit subsample value; <entropy-coded image>
+<prefix codes> ::= <prefix code group> | <prefix code group><prefix codes>
+<prefix code group> ::= <prefix code><prefix code><prefix code>
+                        <prefix code><prefix code>
+                        See "Interpretation of Meta Prefix Codes" to
+                        understand what each of these five prefix codes are
                        for.
-<huffman code> ::= <simple huffman code> | <normal huffman code>
-<simple huffman code> ::= see "Simple code length code" for details
-<normal huffman code> ::= <code length code>; encoded code lengths
+<prefix code> ::= <simple prefix code> | <normal prefix code>
+<simple prefix code> ::= see "Simple code length code" for details
+<normal prefix code> ::= <code length code>; encoded code lengths
 <code length code> ::= see section "Normal code length code"
 <lz77-coded image> ::= ((<argb-pixel> | <lz77-copy> | <color-cache-code>)
                       <lz77-coded image>) | ""
@ -1082,9 +1135,8 @@ A possible example sequence:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 <RIFF header><image size>1-bit value 1<subtract-green-tx>
-1-bit value 1<predictor-tx>1-bit value 0<meta huffman>
-<color cache info><huffman codes>
-<lz77-coded image>
+1-bit value 1<predictor-tx>1-bit value 0<color cache info>1-bit value 0
+<prefix codes><lz77-coded image>
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 [canonical_huff]: https://en.wikipedia.org/wiki/Canonical_Huffman_code
--- a/examples/anim_util.c
+++ b/examples/anim_util.c
@ -241,7 +241,7 @@ static int ReadAnimatedWebP(const char filename[],
  image->bgcolor = anim_info.bgcolor;

  // Allocate frames.
-  if (!AllocateFrames(image, anim_info.frame_count)) return 0;
+  if (!AllocateFrames(image, anim_info.frame_count)) goto End;

  // Decode frames.
  while (WebPAnimDecoderHasMoreFrames(dec)) {
@ -558,7 +558,10 @@ static int ReadAnimatedGIF(const char filename[], AnimatedImage* const image,
    }
  }
  // Allocate frames.
-  AllocateFrames(image, frame_count);
+  if (!AllocateFrames(image, frame_count)) {
+    DGifCloseFile(gif, NULL);
+    return 0;
+  }

  canvas_width = image->canvas_width;
  canvas_height = image->canvas_height;
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@ -571,7 +571,7 @@ static void HelpLong(void) {
  printf("  -qrange <min> <max> .... specifies the permissible quality range\n"
         "                           (default: 0 100)\n");
  printf("  -crop <x> <y> <w> <h> .. crop picture with the given rectangle\n");
-  printf("  -resize <w> <h> ........ resize picture (after any cropping)\n");
+  printf("  -resize <w> <h> ........ resize picture (*after* any cropping)\n");
  printf("  -mt .................... use multi-threading if available\n");
  printf("  -low_memory ............ reduce memory usage (slower encoding)\n");
  printf("  -map <int> ............. print map of extra info\n");
@ -620,6 +620,7 @@ static void HelpLong(void) {
  printf("  -af .................... auto-adjust filter strength\n");
  printf("  -pre <int> ............. pre-processing filter\n");
  printf("\n");
+  printf("Supported input formats:\n  %s\n", WebPGetEnabledInputFileFormats());
 }

 //------------------------------------------------------------------------------
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -96,7 +96,7 @@ static void Help(void) {
         "  -alpha_dither  use alpha-plane dithering if needed\n"
         "  -mt .......... use multi-threading\n"
         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
-         "  -resize <w> <h> ......... scale the output (*after* any cropping)\n"
+         "  -resize <w> <h> ......... resize output (*after* any cropping)\n"
         "  -flip ........ flip the output vertically\n"
         "  -alpha ....... only save the alpha plane\n"
         "  -incremental . use incremental decoding (useful for tests)\n"
--- a/examples/example_util.c
+++ b/examples/example_util.c
@ -103,7 +103,10 @@ int ExUtilInitCommandLineArguments(int argc, const char* argv[],
    }
    args->own_argv_ = 1;
    args->argv_ = (const char**)WebPMalloc(MAX_ARGC * sizeof(*args->argv_));
-    if (args->argv_ == NULL) return 0;
+    if (args->argv_ == NULL) {
+      ExUtilDeleteCommandLineArguments(args);
+      return 0;
+    }

    argc = 0;
    for (cur = strtok((char*)args->argv_data_.bytes, sep);
@ -111,6 +114,7 @@ int ExUtilInitCommandLineArguments(int argc, const char* argv[],
         cur = strtok(NULL, sep)) {
      if (argc == MAX_ARGC) {
        fprintf(stderr, "ERROR: Arguments limit %d reached\n", MAX_ARGC);
+        ExUtilDeleteCommandLineArguments(args);
        return 0;
      }
      assert(strlen(cur) != 0);
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -314,8 +314,11 @@ int main(int argc, const char* argv[]) {
          frame.use_argb = 1;
          if (!WebPPictureAlloc(&frame)) goto End;
          GIFClearPic(&frame, NULL);
-          WebPPictureCopy(&frame, &curr_canvas);
-          WebPPictureCopy(&frame, &prev_canvas);
+          if (!(WebPPictureCopy(&frame, &curr_canvas) &&
+                WebPPictureCopy(&frame, &prev_canvas))) {
+            fprintf(stderr, "Error allocating canvas.\n");
+            goto End;
+          }

          // Background color.
          GIFGetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
--- a/examples/img2webp.c
+++ b/examples/img2webp.c
@ -35,8 +35,7 @@

 static void Help(void) {
  printf("Usage:\n\n");
-  printf("  img2webp [file-level options] [image files...] "
-         "[per-frame options...]\n");
+  printf("  img2webp [file_options] [[frame_options] frame_file]...\n");
  printf("\n");

  printf("File-level options (only used at the start of compression):\n");
@ -66,6 +65,8 @@ static void Help(void) {
         "arguments will be\n");
  printf("tokenized from this file. The file name must not start with "
         "the character '-'.\n");
+  printf("\nSupported input formats:\n  %s\n",
+         WebPGetEnabledInputFileFormats());
 }

 //------------------------------------------------------------------------------
@ -187,7 +188,7 @@ int main(int argc, const char* argv[]) {
        verbose = 1;
      } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
        Help();
-        goto End;
+        FREE_WARGV_AND_RETURN(0);
      } else if (!strcmp(argv[c], "-version")) {
        const int enc_version = WebPGetEncoderVersion();
        const int mux_version = WebPGetMuxVersion();
--- a/examples/unicode.h
+++ b/examples/unicode.h
@ -16,11 +16,15 @@
 #ifndef WEBP_EXAMPLES_UNICODE_H_
 #define WEBP_EXAMPLES_UNICODE_H_

+#include <stdio.h>
+
 #if defined(_WIN32) && defined(_UNICODE)

 // wchar_t is used instead of TCHAR because we only perform additional work when
 // Unicode is enabled and because the output of CommandLineToArgvW() is wchar_t.

+#include <fcntl.h>
+#include <io.h>
 #include <wchar.h>
 #include <windows.h>
 #include <shellapi.h>
@ -55,8 +59,16 @@

 #define WFOPEN(ARG, OPT) _wfopen((const W_CHAR*)ARG, TO_W_CHAR(OPT))

-#define WPRINTF(STR, ...) wprintf(TO_W_CHAR(STR), __VA_ARGS__)
-#define WFPRINTF(STDERR, STR, ...) fwprintf(STDERR, TO_W_CHAR(STR), __VA_ARGS__)
+#define WFPRINTF(STREAM, STR, ...)                    \
+  do {                                                \
+    int prev_mode;                                    \
+    fflush(STREAM);                                   \
+    prev_mode = _setmode(_fileno(STREAM), _O_U8TEXT); \
+    fwprintf(STREAM, TO_W_CHAR(STR), __VA_ARGS__);    \
+    fflush(STREAM);                                   \
+    (void)_setmode(_fileno(STREAM), prev_mode);       \
+  } while (0)
+#define WPRINTF(STR, ...) WFPRINTF(stdout, STR, __VA_ARGS__)

 #define WSTRLEN(FILENAME) wcslen((const W_CHAR*)FILENAME)
 #define WSTRCMP(FILENAME, STR) wcscmp((const W_CHAR*)FILENAME, TO_W_CHAR(STR))
@ -65,6 +77,8 @@

 #else

+#include <string.h>
+
 // Unicode file paths work as is on Unix platforms, and no extra work is done on
 // Windows either if Unicode is disabled.

@ -83,7 +97,7 @@
 #define WFOPEN(ARG, OPT) fopen(ARG, OPT)

 #define WPRINTF(STR, ...) printf(STR, __VA_ARGS__)
-#define WFPRINTF(STDERR, STR, ...) fprintf(STDERR, STR, __VA_ARGS__)
+#define WFPRINTF(STREAM, STR, ...) fprintf(STREAM, STR, __VA_ARGS__)

 #define WSTRLEN(FILENAME) strlen(FILENAME)
 #define WSTRCMP(FILENAME, STR) strcmp(FILENAME, STR)
--- a/examples/unicode_gif.h
+++ b/examples/unicode_gif.h
@ -45,7 +45,7 @@ static GifFileType* DGifOpenFileUnicode(const W_CHAR* file_name, int* error) {
  }

 #if defined(_WIN32) && defined(_UNICODE)
-
+  {
    int file_handle = _wopen(file_name, _O_RDONLY | _O_BINARY);
    if (file_handle == -1) {
      if (error != NULL) *error = D_GIF_ERR_OPEN_FAILED;
@ -57,6 +57,7 @@ static GifFileType* DGifOpenFileUnicode(const W_CHAR* file_name, int* error) {
 #else
    return DGifOpenFileHandle(file_handle);
 #endif
+  }

 #else

--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -292,6 +292,19 @@ static void PrintString(const char* const text) {
  }
 }

+static void PrintStringW(const char* const text) {
+#if defined(_WIN32) && defined(_UNICODE)
+  void* const font = GLUT_BITMAP_9_BY_15;
+  const W_CHAR* const wtext = (const W_CHAR*)text;
+  int i;
+  for (i = 0; wtext[i]; ++i) {
+    glutBitmapCharacter(font, wtext[i]);
+  }
+#else
+  PrintString(text);
+#endif
+}
+
 static float GetColorf(uint32_t color, int shift) {
  return ((color >> shift) & 0xff) / 255.f;
 }
@ -396,7 +409,7 @@ static void HandleDisplay(void) {

    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
    glRasterPos2f(-0.95f, 0.90f);
-    PrintString(kParams.file_name);
+    PrintStringW(kParams.file_name);

    snprintf(tmp, sizeof(tmp), "Dimension:%d x %d", pic->width, pic->height);
    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
--- a/examples/webpinfo.c
+++ b/examples/webpinfo.c
@ -125,16 +125,16 @@ static void WebPInfoInit(WebPInfo* const webp_info) {
  memset(webp_info, 0, sizeof(*webp_info));
 }

-static const char kWebPChunkTags[CHUNK_TYPES][4] = {
-  { 'V', 'P', '8', ' ' },
-  { 'V', 'P', '8', 'L' },
-  { 'V', 'P', '8', 'X' },
-  { 'A', 'L', 'P', 'H' },
-  { 'A', 'N', 'I', 'M' },
-  { 'A', 'N', 'M', 'F' },
-  { 'I', 'C', 'C', 'P' },
-  { 'E', 'X', 'I', 'F' },
-  { 'X', 'M', 'P', ' ' },
+static const uint32_t kWebPChunkTags[CHUNK_TYPES] = {
+  MKFOURCC('V', 'P', '8', ' '),
+  MKFOURCC('V', 'P', '8', 'L'),
+  MKFOURCC('V', 'P', '8', 'X'),
+  MKFOURCC('A', 'L', 'P', 'H'),
+  MKFOURCC('A', 'N', 'I', 'M'),
+  MKFOURCC('A', 'N', 'M', 'F'),
+  MKFOURCC('I', 'C', 'C', 'P'),
+  MKFOURCC('E', 'X', 'I', 'F'),
+  MKFOURCC('X', 'M', 'P', ' '),
 };

 // -----------------------------------------------------------------------------
@ -644,7 +644,7 @@ static WebPInfoStatus ParseChunk(const WebPInfo* const webp_info,
      return WEBP_INFO_TRUNCATED_DATA;
    }
    for (i = 0; i < CHUNK_TYPES; ++i) {
-      if (!memcmp(kWebPChunkTags[i], &fourcc, TAG_SIZE)) break;
+      if (kWebPChunkTags[i] == fourcc) break;
    }
    chunk_data->offset_ = chunk_start_offset;
    chunk_data->size_ = chunk_size;
@ -939,7 +939,13 @@ static WebPInfoStatus ProcessChunk(const ChunkData* const chunk_data,
    LOG_WARN(error_message);
  } else {
    if (!webp_info->quiet_) {
-      const char* tag = kWebPChunkTags[chunk_data->id_];
+      char tag[4];
+      uint32_t fourcc = kWebPChunkTags[chunk_data->id_];
+#ifdef WORDS_BIGENDIAN
+      fourcc = (fourcc >> 24) | ((fourcc >> 8) & 0xff00) |
+               ((fourcc << 8) & 0xff0000) | (fourcc << 24);
+#endif
+      memcpy(tag, &fourcc, sizeof(tag));
      printf("Chunk %c%c%c%c at offset %6d, length %6d\n",
             tag[0], tag[1], tag[2], tag[3], (int)chunk_data->offset_,
             (int)chunk_data->size_);
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
@ -329,7 +329,7 @@ static void PrintHelp(void) {
  printf("\n");
  printf("DURATION_OPTIONS:\n");
  printf(" Set duration of selected frames:\n");
-  printf("   duration            set duration for each frames\n");
+  printf("   duration            set duration for all frames\n");
  printf("   duration,frame      set duration of a particular frame\n");
  printf("   duration,start,end  set duration of frames in the\n");
  printf("                        interval [start,end])\n");
@ -348,7 +348,7 @@ static void PrintHelp(void) {
  printf("\n");
  printf("FRAME_OPTIONS(i):\n");
  printf(" Create animation:\n");
-  printf("   file_i +di+[xi+yi[+mi[bi]]]\n");
+  printf("   file_i +di[+xi+yi[+mi[bi]]]\n");
  printf("   where:    'file_i' is the i'th animation frame (WebP format),\n");
  printf("             'di' is the pause duration before next frame,\n");
  printf("             'xi','yi' specify the image offset for this frame,\n");
@ -460,7 +460,8 @@ static WebPMux* DuplicateMuxHeader(const WebPMux* const mux) {
    if (err == WEBP_MUX_OK && metadata.size > 0) {
      err = WebPMuxSetChunk(new_mux, kFourccList[i], &metadata, 1);
      if (err != WEBP_MUX_OK) {
-        ERROR_GOTO1("Error transferring metadata in DuplicateMux().", End);
+        ERROR_GOTO1("Error transferring metadata in DuplicateMuxHeader().",
+                    End);
      }
    }
  }
@ -684,7 +685,7 @@ static int ParseCommandLine(Config* config, const W_CHAR** const unicode_argv) {
          ERROR_GOTO1("ERROR: Multiple features specified.\n", ErrParse);
        }
        arg->subtype_ = SUBTYPE_ANMF;
-        arg->filename_ = argv[i + 1];
+        arg->filename_ = wargv[i + 1];
        arg->params_ = argv[i + 2];
        ++feature_arg_index;
        i += 3;
--- a/extras/extras.c
+++ b/extras/extras.c
@ -19,7 +19,7 @@

 #define XTRA_MAJ_VERSION 1
 #define XTRA_MIN_VERSION 2
-#define XTRA_REV_VERSION 2
+#define XTRA_REV_VERSION 3

 //------------------------------------------------------------------------------

--- a/extras/get_disto.c
+++ b/extras/get_disto.c
@ -223,7 +223,8 @@ static void Help(void) {
          "  -o <file> . save the diff map as a WebP lossless file\n"
          "  -scale .... scale the difference map to fit [0..255] range\n"
          "  -gray ..... use grayscale for difference map (-scale)\n"
-          " Also handles PNG, JPG and TIFF files, in addition to WebP.\n");
+          "\nSupported input formats:\n  %s\n",
+          WebPGetEnabledInputFileFormats());
 }

 int main(int argc, const char* argv[]) {
--- a/imageio/image_dec.c
+++ b/imageio/image_dec.c
@ -11,6 +11,24 @@

 #include "./image_dec.h"

+const char* WebPGetEnabledInputFileFormats(void) {
+  return "WebP"
+#ifdef WEBP_HAVE_JPEG
+         ", JPEG"
+#endif
+#ifdef WEBP_HAVE_PNG
+         ", PNG"
+#endif
+         ", PNM (PGM, PPM, PAM)"
+#ifdef WEBP_HAVE_TIFF
+         ", TIFF"
+#endif
+#ifdef HAVE_WINCODEC_H
+         ", Windows Imaging Component (WIC)"
+#endif
+         "";
+}
+
 static WEBP_INLINE uint32_t GetBE32(const uint8_t buf[]) {
  return ((uint32_t)buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
 }
--- a/imageio/image_dec.h
+++ b/imageio/image_dec.h
@ -41,6 +41,9 @@ typedef enum {
  WEBP_UNSUPPORTED_FORMAT
 } WebPInputFileFormat;

+// Returns a comma separated list of enabled input formats.
+const char* WebPGetEnabledInputFileFormats(void);
+
 // Try to infer the image format. 'data_size' should be larger than 12.
 // Returns WEBP_UNSUPPORTED_FORMAT if format can't be guess safely.
 WebPInputFileFormat WebPGuessImageType(const uint8_t* const data,
--- a/imageio/image_enc.c
+++ b/imageio/image_enc.c
@ -280,7 +280,7 @@ int WebPWrite16bAsPGM(FILE* fout, const WebPDecBuffer* const buffer) {
 }

 //------------------------------------------------------------------------------
-// BMP
+// BMP (see https://en.wikipedia.org/wiki/BMP_file_format#Pixel_storage)

 static void PutLE16(uint8_t* const dst, uint32_t value) {
  dst[0] = (value >> 0) & 0xff;
@ -293,8 +293,11 @@ static void PutLE32(uint8_t* const dst, uint32_t value) {
 }

 #define BMP_HEADER_SIZE 54
+#define BMP_HEADER_ALPHA_EXTRA_SIZE 16  // for alpha info
 int WebPWriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
  const int has_alpha = WebPIsAlphaMode(buffer->colorspace);
+  const int header_size =
+      BMP_HEADER_SIZE + (has_alpha ? BMP_HEADER_ALPHA_EXTRA_SIZE : 0);
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
  const uint8_t* rgba = buffer->u.RGBA.rgba;
@ -303,8 +306,9 @@ int WebPWriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
  uint32_t y;
  const uint32_t line_size = bytes_per_px * width;
  const uint32_t bmp_stride = (line_size + 3) & ~3;   // pad to 4
-  const uint32_t total_size = bmp_stride * height + BMP_HEADER_SIZE;
-  uint8_t bmp_header[BMP_HEADER_SIZE] = { 0 };
+  const uint32_t image_size = bmp_stride * height;
+  const uint32_t total_size =  image_size + header_size;
+  uint8_t bmp_header[BMP_HEADER_SIZE + BMP_HEADER_ALPHA_EXTRA_SIZE] = { 0 };

  if (fout == NULL || buffer == NULL || rgba == NULL) return 0;

@ -312,30 +316,37 @@ int WebPWriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
  PutLE16(bmp_header + 0, 0x4d42);                // signature 'BM'
  PutLE32(bmp_header + 2, total_size);            // size including header
  PutLE32(bmp_header + 6, 0);                     // reserved
-  PutLE32(bmp_header + 10, BMP_HEADER_SIZE);      // offset to pixel array
+  PutLE32(bmp_header + 10, header_size);          // offset to pixel array
  // bitmap info header
-  PutLE32(bmp_header + 14, 40);                   // DIB header size
+  PutLE32(bmp_header + 14, header_size - 14);     // DIB header size
  PutLE32(bmp_header + 18, width);                // dimensions
-  PutLE32(bmp_header + 22, -(int)height);         // vertical flip!
+  PutLE32(bmp_header + 22, height);               // no vertical flip
  PutLE16(bmp_header + 26, 1);                    // number of planes
  PutLE16(bmp_header + 28, bytes_per_px * 8);     // bits per pixel
-  PutLE32(bmp_header + 30, 0);                    // no compression (BI_RGB)
-  PutLE32(bmp_header + 34, 0);                    // image size (placeholder)
+  PutLE32(bmp_header + 30, has_alpha ? 3 : 0);    // BI_BITFIELDS or BI_RGB
+  PutLE32(bmp_header + 34, image_size);
  PutLE32(bmp_header + 38, 2400);                 // x pixels/meter
  PutLE32(bmp_header + 42, 2400);                 // y pixels/meter
  PutLE32(bmp_header + 46, 0);                    // number of palette colors
  PutLE32(bmp_header + 50, 0);                    // important color count
+  if (has_alpha) {  // BITMAPV3INFOHEADER complement
+    PutLE32(bmp_header + 54, 0x00ff0000);         // red mask
+    PutLE32(bmp_header + 58, 0x0000ff00);         // green mask
+    PutLE32(bmp_header + 62, 0x000000ff);         // blue mask
+    PutLE32(bmp_header + 66, 0xff000000);         // alpha mask
+  }

  // TODO(skal): color profile

  // write header
-  if (fwrite(bmp_header, sizeof(bmp_header), 1, fout) != 1) {
+  if (fwrite(bmp_header, header_size, 1, fout) != 1) {
    return 0;
  }

-  // write pixel array
+  // write pixel array, bottom to top
  for (y = 0; y < height; ++y) {
-    if (fwrite(rgba, line_size, 1, fout) != 1) {
+    const uint8_t* const src = &rgba[(uint64_t)(height - 1 - y) * stride];
+    if (fwrite(src, line_size, 1, fout) != 1) {
      return 0;
    }
    // write padding zeroes
@ -345,11 +356,11 @@ int WebPWriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
        return 0;
      }
    }
-    rgba += stride;
  }
  return 1;
 }
 #undef BMP_HEADER_SIZE
+#undef BMP_HEADER_ALPHA_EXTRA_SIZE

 //------------------------------------------------------------------------------
 // TIFF
--- a/imageio/jpegdec.c
+++ b/imageio/jpegdec.c
@ -336,7 +336,11 @@ int ReadJPEG(const uint8_t* const data, size_t data_size,
  pic->width = width;
  pic->height = height;
  ok = WebPPictureImportRGB(pic, rgb, (int)stride);
-  if (!ok) goto Error;
+  if (!ok) {
+    pic->width = 0;   // WebPPictureImportRGB() barely touches 'pic' on failure.
+    pic->height = 0;  // Just reset dimensions but keep any 'custom_ptr' etc.
+    MetadataFree(metadata);  // In case the caller forgets to free it on error.
+  }

 End:
  free(rgb);
--- a/imageio/tiffdec.c
+++ b/imageio/tiffdec.c
@ -188,7 +188,9 @@ int ReadTIFF(const uint8_t* const data, size_t data_size,
    fprintf(stderr, "Error! Cannot retrieve TIFF samples-per-pixel info.\n");
    goto End;
  }
-  if (samples_per_px < 3 || samples_per_px > 4) goto End;  // not supported
+  if (!(samples_per_px == 1 || samples_per_px == 3 || samples_per_px == 4)) {
+    goto End;  // not supported
+  }

  if (!(TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &image_width) &&
        TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &image_height))) {
@ -212,7 +214,7 @@ int ReadTIFF(const uint8_t* const data, size_t data_size,
      TIFFGetField(tif, TIFFTAG_TILELENGTH, &tile_height)) {
    if ((tile_width > 32 && tile_width / 2 > image_width) ||
        (tile_height > 32 && tile_height / 2 > image_height) ||
-        ImgIoUtilCheckSizeArgumentsOverflow(
+        !ImgIoUtilCheckSizeArgumentsOverflow(
            (uint64_t)tile_width * sizeof(*raster), tile_height)) {
      fprintf(stderr, "Error! TIFF tile dimension (%d x %d) is too large.\n",
              tile_width, tile_height);
--- a/infra/run_static_analysis.sh
+++ b/infra/run_static_analysis.sh
@ -85,10 +85,10 @@ scan_build make -j4

 index="$(find "${OUTPUT_DIR}" -name index.html)"
 if [[ -f "${index}" ]]; then
-  mv "$(dirname "${index}")/"* .
+  mv "$(dirname "${index}")/"* "${OUTPUT_DIR}"
 else
  # make a empty report to wipe out any old bug reports.
-  cat << EOT > index.html
+  cat << EOT > "${OUTPUT_DIR}/index.html"
 <html>
 <body>
 No bugs reported.
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -133,10 +133,9 @@ for PLATFORM in ${PLATFORMS}; do
    CFLAGS="${CFLAGS}"
  set +x

-  # run make only in the src/ directory to create libwebp.a/libwebpdecoder.a
-  cd src/
-  make V=0
-  make install
+  # Build only the libraries, skip the examples.
+  make V=0 -C sharpyuv
+  make V=0 -C src install

  LIBLIST+=" ${ROOTDIR}/lib/libwebp.a"
  DECLIBLIST+=" ${ROOTDIR}/lib/libwebpdecoder.a"
@ -144,7 +143,6 @@ for PLATFORM in ${PLATFORMS}; do
  DEMUXLIBLIST+=" ${ROOTDIR}/lib/libwebpdemux.a"

  make clean
-  cd ..

  export PATH=${OLDPATH}
 done
--- a/makefile.unix
+++ b/makefile.unix
@ -125,6 +125,14 @@ endif
 ANIM_UTIL_OBJS = \
    examples/anim_util.o \

+SHARPYUV_OBJS = \
+    sharpyuv/sharpyuv.o \
+    sharpyuv/sharpyuv_csp.o \
+    sharpyuv/sharpyuv_dsp.o \
+    sharpyuv/sharpyuv_gamma.o \
+    sharpyuv/sharpyuv_neon.o \
+    sharpyuv/sharpyuv_sse2.o \
+
 DEC_OBJS = \
    src/dec/alpha_dec.o \
    src/dec/buffer_dec.o \
@ -282,8 +290,8 @@ EXTRA_OBJS = \
    extras/quality_estimate.o \

 LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
-LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
-               $(UTILS_ENC_OBJS)
+LIBWEBP_OBJS = $(SHARPYUV_OBJS) $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) \
+               $(DSP_ENC_OBJS) $(UTILS_ENC_OBJS)
 LIBWEBPMUX_OBJS = $(MUX_OBJS)
 LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS)
 LIBWEBPEXTRA_OBJS = $(EXTRA_OBJS)
@ -304,6 +312,7 @@ HDRS = \
    src/dec/vp8li_dec.h \
    src/dec/webpi_dec.h \
    src/dsp/common_sse2.h \
+    src/dsp/cpu.h \
    src/dsp/dsp.h \
    src/dsp/lossless.h \
    src/dsp/lossless_common.h \
@ -489,6 +498,7 @@ clean:
              examples/*.o examples/*~ \
              extras/*.o extras/*~ \
              imageio/*.o imageio/*~ \
+              sharpyuv/*.o sharpyuv/*~ \
              src/dec/*.o src/dec/*~ \
              src/demux/*.o src/demux/*~ \
              src/dsp/*.o src/dsp/*~ \
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "November 17, 2021"
+.TH CWEBP 1 "March 17, 2022"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@ -90,15 +90,17 @@ additional encoding possibilities and decide on the quality gain.
 Lower value can result in faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
-.BI \-resize " width height
-Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
-If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
-the value will be calculated preserving the aspect\-ratio.
-.TP
 .BI \-crop " x_position y_position width height
 Crop the source to a rectangle with top\-left corner at coordinates
 (\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
 This cropping area must be fully contained within the source rectangle.
+Note: the cropping is applied \fIbefore\fP any scaling.
+.TP
+.BI \-resize " width height
+Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
+If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
+the value will be calculated preserving the aspect\-ratio. Note: scaling
+is applied \fIafter\fP cropping.
 .TP
 .B \-mt
 Use multi\-threading for encoding, if possible.
--- a/man/img2webp.1
+++ b/man/img2webp.1
@ -1,10 +1,10 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH IMG2WEBP 1 "November 17, 2021"
+.TH IMG2WEBP 1 "January 5, 2022"
 .SH NAME
 img2webp \- create animated WebP file from a sequence of input images.
 .SH SYNOPSIS
 .B img2webp
-[file_level_options] [files] [per_frame_options...]
+[file_options] [[frame_options] frame_file]...
 .br
 .B img2webp argument_file_name
 .br
--- a/sharpyuv/Makefile.am
+++ b/sharpyuv/Makefile.am
@ -0,0 +1,34 @@
+AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
+AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src
+noinst_LTLIBRARIES =
+noinst_LTLIBRARIES += libsharpyuv.la
+noinst_LTLIBRARIES += libsharpyuv_sse2.la
+noinst_LTLIBRARIES += libsharpyuv_neon.la
+
+noinst_HEADERS =
+noinst_HEADERS += ../src/webp/types.h
+noinst_HEADERS += ../src/dsp/cpu.h
+
+libsharpyuv_sse2_la_SOURCES =
+libsharpyuv_sse2_la_SOURCES += sharpyuv_sse2.c
+libsharpyuv_sse2_la_CPPFLAGS = $(libsharpyuv_la_CPPFLAGS)
+libsharpyuv_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
+
+libsharpyuv_neon_la_SOURCES =
+libsharpyuv_neon_la_SOURCES += sharpyuv_neon.c
+libsharpyuv_neon_la_CPPFLAGS = $(libsharpyuv_la_CPPFLAGS)
+libsharpyuv_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_FLAGS)
+
+libsharpyuv_la_SOURCES =
+libsharpyuv_la_SOURCES += sharpyuv_csp.c sharpyuv_csp.h
+libsharpyuv_la_SOURCES += sharpyuv_dsp.c sharpyuv_dsp.h
+libsharpyuv_la_SOURCES += sharpyuv_gamma.c sharpyuv_gamma.h
+libsharpyuv_la_SOURCES += sharpyuv.c sharpyuv.h
+
+libsharpyuv_la_CPPFLAGS = $(AM_CPPFLAGS)
+libsharpyuv_la_LDFLAGS =
+libsharpyuv_la_LIBADD =
+libsharpyuv_la_LIBADD += libsharpyuv_sse2.la
+libsharpyuv_la_LIBADD += libsharpyuv_neon.la
+
+noinst_PROGRAMS =
--- a/sharpyuv/sharpyuv.c
+++ b/sharpyuv/sharpyuv.c
@ -0,0 +1,498 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "src/webp/types.h"
+#include "src/dsp/cpu.h"
+#include "sharpyuv/sharpyuv_dsp.h"
+#include "sharpyuv/sharpyuv_gamma.h"
+
+//------------------------------------------------------------------------------
+// Sharp RGB->YUV conversion
+
+static const int kNumIterations = 4;
+
+#define YUV_FIX 16  // fixed-point precision for RGB->YUV
+static const int kYuvHalf = 1 << (YUV_FIX - 1);
+
+// Max bit depth so that intermediate calculations fit in 16 bits.
+static const int kMaxBitDepth = 14;
+
+// Returns the precision shift to use based on the input rgb_bit_depth.
+static int GetPrecisionShift(int rgb_bit_depth) {
+  // Try to add 2 bits of precision if it fits in kMaxBitDepth. Otherwise remove
+  // bits if needed.
+  return ((rgb_bit_depth + 2) <= kMaxBitDepth) ? 2
+                                               : (kMaxBitDepth - rgb_bit_depth);
+}
+
+typedef int16_t fixed_t;      // signed type with extra precision for UV
+typedef uint16_t fixed_y_t;   // unsigned type with extra precision for W
+
+//------------------------------------------------------------------------------
+
+static uint8_t clip_8b(fixed_t v) {
+  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static uint16_t clip(fixed_t v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static fixed_y_t clip_bit_depth(int y, int bit_depth) {
+  const int max = (1 << bit_depth) - 1;
+  return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 0 : max;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int64_t r, int64_t g, int64_t b) {
+  const int64_t luma = 13933 * r + 46871 * g + 4732 * b + kYuvHalf;
+  return (int)(luma >> YUV_FIX);
+}
+
+static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+                          int rgb_bit_depth) {
+  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  const uint32_t A = SharpYuvGammaToLinear(a, bit_depth);
+  const uint32_t B = SharpYuvGammaToLinear(b, bit_depth);
+  const uint32_t C = SharpYuvGammaToLinear(c, bit_depth);
+  const uint32_t D = SharpYuvGammaToLinear(d, bit_depth);
+  return SharpYuvLinearToGamma((A + B + C + D + 2) >> 2, bit_depth);
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
+                                int rgb_bit_depth) {
+  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  int i;
+  for (i = 0; i < w; ++i) {
+    const uint32_t R = SharpYuvGammaToLinear(src[0 * w + i], bit_depth);
+    const uint32_t G = SharpYuvGammaToLinear(src[1 * w + i], bit_depth);
+    const uint32_t B = SharpYuvGammaToLinear(src[2 * w + i], bit_depth);
+    const uint32_t Y = RGBToGray(R, G, B);
+    dst[i] = (fixed_y_t)SharpYuvLinearToGamma(Y, bit_depth);
+  }
+}
+
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+                         fixed_t* dst, int uv_w, int rgb_bit_depth) {
+  int i;
+  for (i = 0; i < uv_w; ++i) {
+    const int r =
+        ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], src2[0 * uv_w + 0],
+                  src2[0 * uv_w + 1], rgb_bit_depth);
+    const int g =
+        ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], src2[2 * uv_w + 0],
+                  src2[2 * uv_w + 1], rgb_bit_depth);
+    const int b =
+        ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], src2[4 * uv_w + 0],
+                  src2[4 * uv_w + 1], rgb_bit_depth);
+    const int W = RGBToGray(r, g, b);
+    dst[0 * uv_w] = (fixed_t)(r - W);
+    dst[1 * uv_w] = (fixed_t)(g - W);
+    dst[2 * uv_w] = (fixed_t)(b - W);
+    dst  += 1;
+    src1 += 2;
+    src2 += 2;
+  }
+}
+
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+  int i;
+  assert(w > 0);
+  for (i = 0; i < w; ++i) {
+    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) {
+  const int v0 = (A * 3 + B + 2) >> 2;
+  return clip_bit_depth(v0 + W0, bit_depth);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int Shift(int v, int shift) {
+  return (shift >= 0) ? (v << shift) : (v >> -shift);
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+                         const uint8_t* const g_ptr,
+                         const uint8_t* const b_ptr,
+                         int rgb_step,
+                         int rgb_bit_depth,
+                         int pic_width,
+                         fixed_y_t* const dst) {
+  // Convert the rgb_step from a number of bytes to a number of uint8_t or
+  // uint16_t values depending the bit depth.
+  const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
+  int i;
+  const int w = (pic_width + 1) & ~1;
+  for (i = 0; i < pic_width; ++i) {
+    const int off = i * step;
+    const int shift = GetPrecisionShift(rgb_bit_depth);
+    if (rgb_bit_depth == 8) {
+      dst[i + 0 * w] = Shift(r_ptr[off], shift);
+      dst[i + 1 * w] = Shift(g_ptr[off], shift);
+      dst[i + 2 * w] = Shift(b_ptr[off], shift);
+    } else {
+      dst[i + 0 * w] = Shift(((uint16_t*)r_ptr)[off], shift);
+      dst[i + 1 * w] = Shift(((uint16_t*)g_ptr)[off], shift);
+      dst[i + 2 * w] = Shift(((uint16_t*)b_ptr)[off], shift);
+    }
+  }
+  if (pic_width & 1) {  // replicate rightmost pixel
+    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
+  }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+                               const fixed_t* prev_uv,
+                               const fixed_t* cur_uv,
+                               const fixed_t* next_uv,
+                               int w,
+                               fixed_y_t* out1,
+                               fixed_y_t* out2,
+                               int rgb_bit_depth) {
+  const int uv_w = w >> 1;
+  const int len = (w - 1) >> 1;   // length to filter
+  int k = 3;
+  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  while (k-- > 0) {   // process each R/G/B segments in turn
+    // special boundary case for i==0
+    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0], bit_depth);
+    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w], bit_depth);
+
+    SharpYuvFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1,
+                      bit_depth);
+    SharpYuvFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1,
+                      bit_depth);
+
+    // special boundary case for i == w - 1 when w is even
+    if (!(w & 1)) {
+      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+                            best_y[w - 1 + 0], bit_depth);
+      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+                            best_y[w - 1 + w], bit_depth);
+    }
+    out1 += w;
+    out2 += w;
+    prev_uv += uv_w;
+    cur_uv  += uv_w;
+    next_uv += uv_w;
+  }
+}
+
+static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b,
+                                         const int coeffs[4], int sfix) {
+  const int srounder = 1 << (YUV_FIX + sfix - 1);
+  const int luma = coeffs[0] * r + coeffs[1] * g + coeffs[2] * b +
+                   coeffs[3] + srounder;
+  return (luma >> (YUV_FIX + sfix));
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
+                            uint8_t* y_ptr, int y_stride, uint8_t* u_ptr,
+                            int u_stride, uint8_t* v_ptr, int v_stride,
+                            int rgb_bit_depth,
+                            int yuv_bit_depth, int width, int height,
+                            const SharpYuvConversionMatrix* yuv_matrix) {
+  int i, j;
+  const fixed_t* const best_uv_base = best_uv;
+  const int w = (width + 1) & ~1;
+  const int h = (height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  const int sfix = GetPrecisionShift(rgb_bit_depth);
+  const int yuv_max = (1 << yuv_bit_depth) - 1;
+
+  for (best_uv = best_uv_base, j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const int off = (i >> 1);
+      const int W = best_y[i];
+      const int r = best_uv[off + 0 * uv_w] + W;
+      const int g = best_uv[off + 1 * uv_w] + W;
+      const int b = best_uv[off + 2 * uv_w] + W;
+      const int y = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix);
+      if (yuv_bit_depth <= 8) {
+        y_ptr[i] = clip_8b(y);
+      } else {
+        ((uint16_t*)y_ptr)[i] = clip(y, yuv_max);
+      }
+    }
+    best_y += w;
+    best_uv += (j & 1) * 3 * uv_w;
+    y_ptr += y_stride;
+  }
+  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
+    for (i = 0; i < uv_w; ++i) {
+      const int off = i;
+      // Note r, g and b values here are off by W, but a constant offset on all
+      // 3 components doesn't change the value of u and v with a YCbCr matrix.
+      const int r = best_uv[off + 0 * uv_w];
+      const int g = best_uv[off + 1 * uv_w];
+      const int b = best_uv[off + 2 * uv_w];
+      const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
+      const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
+      if (yuv_bit_depth <= 8) {
+        u_ptr[i] = clip_8b(u);
+        v_ptr[i] = clip_8b(v);
+      } else {
+        ((uint16_t*)u_ptr)[i] = clip(u, yuv_max);
+        ((uint16_t*)v_ptr)[i] = clip(v, yuv_max);
+      }
+    }
+    best_uv += 3 * uv_w;
+    u_ptr += u_stride;
+    v_ptr += v_stride;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+static void* SafeMalloc(uint64_t nmemb, size_t size) {
+  const uint64_t total_size = nmemb * (uint64_t)size;
+  if (total_size != (size_t)total_size) return NULL;
+  return malloc((size_t)total_size);
+}
+
+#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((W) * (H), sizeof(T)))
+
+static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
+                            const uint8_t* b_ptr, int rgb_step, int rgb_stride,
+                            int rgb_bit_depth, uint8_t* y_ptr, int y_stride,
+                            uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
+                            int v_stride, int yuv_bit_depth, int width,
+                            int height,
+                            const SharpYuvConversionMatrix* yuv_matrix) {
+  // we expand the right/bottom border if needed
+  const int w = (width + 1) & ~1;
+  const int h = (height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  uint64_t prev_diff_y_sum = ~0;
+  int j, iter;
+
+  // TODO(skal): allocate one big memory chunk. But for now, it's easier
+  // for valgrind debugging to have several chunks.
+  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
+  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+  fixed_y_t* best_y = best_y_base;
+  fixed_y_t* target_y = target_y_base;
+  fixed_t* best_uv = best_uv_base;
+  fixed_t* target_uv = target_uv_base;
+  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
+  int ok;
+  assert(w > 0);
+  assert(h > 0);
+
+  if (best_y_base == NULL || best_uv_base == NULL ||
+      target_y_base == NULL || target_uv_base == NULL ||
+      best_rgb_y == NULL || best_rgb_uv == NULL ||
+      tmp_buffer == NULL) {
+    ok = 0;
+    goto End;
+  }
+
+  // Import RGB samples to W/RGB representation.
+  for (j = 0; j < height; j += 2) {
+    const int is_last_row = (j == height - 1);
+    fixed_y_t* const src1 = tmp_buffer + 0 * w;
+    fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+    // prepare two rows of input
+    ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width,
+                 src1);
+    if (!is_last_row) {
+      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
+                   rgb_step, rgb_bit_depth, width, src2);
+    } else {
+      memcpy(src2, src1, 3 * w * sizeof(*src2));
+    }
+    StoreGray(src1, best_y + 0, w);
+    StoreGray(src2, best_y + w, w);
+
+    UpdateW(src1, target_y, w, rgb_bit_depth);
+    UpdateW(src2, target_y + w, w, rgb_bit_depth);
+    UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth);
+    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+    best_y += 2 * w;
+    best_uv += 3 * uv_w;
+    target_y += 2 * w;
+    target_uv += 3 * uv_w;
+    r_ptr += 2 * rgb_stride;
+    g_ptr += 2 * rgb_stride;
+    b_ptr += 2 * rgb_stride;
+  }
+
+  // Iterate and resolve clipping conflicts.
+  for (iter = 0; iter < kNumIterations; ++iter) {
+    const fixed_t* cur_uv = best_uv_base;
+    const fixed_t* prev_uv = best_uv_base;
+    uint64_t diff_y_sum = 0;
+
+    best_y = best_y_base;
+    best_uv = best_uv_base;
+    target_y = target_y_base;
+    target_uv = target_uv_base;
+    for (j = 0; j < h; j += 2) {
+      fixed_y_t* const src1 = tmp_buffer + 0 * w;
+      fixed_y_t* const src2 = tmp_buffer + 3 * w;
+      {
+        const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w,
+                           src1, src2, rgb_bit_depth);
+        prev_uv = cur_uv;
+        cur_uv = next_uv;
+      }
+
+      UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth);
+      UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth);
+      UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth);
+
+      // update two rows of Y and one row of RGB
+      diff_y_sum +=
+          SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w,
+                          rgb_bit_depth + GetPrecisionShift(rgb_bit_depth));
+      SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+      best_y += 2 * w;
+      best_uv += 3 * uv_w;
+      target_y += 2 * w;
+      target_uv += 3 * uv_w;
+    }
+    // test exit condition
+    if (iter > 0) {
+      if (diff_y_sum < diff_y_threshold) break;
+      if (diff_y_sum > prev_diff_y_sum) break;
+    }
+    prev_diff_y_sum = diff_y_sum;
+  }
+
+  // final reconstruction
+  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr,
+                        u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth,
+                        width, height, yuv_matrix);
+
+ End:
+  free(best_y_base);
+  free(best_uv_base);
+  free(target_y_base);
+  free(target_uv_base);
+  free(best_rgb_y);
+  free(best_rgb_uv);
+  free(tmp_buffer);
+  return ok;
+}
+#undef SAFE_ALLOC
+
+// Hidden exported init function.
+// By default SharpYuvConvert calls it with NULL. If needed, users can declare
+// it as extern and call it with a VP8CPUInfo function.
+extern void SharpYuvInit(VP8CPUInfo cpu_info_func);
+void SharpYuvInit(VP8CPUInfo cpu_info_func) {
+  static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used =
+      (VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
+  const int initialized =
+      (sharpyuv_last_cpuinfo_used != (VP8CPUInfo)&sharpyuv_last_cpuinfo_used);
+  if (cpu_info_func == NULL && initialized) return;
+  if (sharpyuv_last_cpuinfo_used == cpu_info_func) return;
+
+  SharpYuvInitDsp(cpu_info_func);
+  if (!initialized) {
+    SharpYuvInitGammaTables();
+  }
+
+  sharpyuv_last_cpuinfo_used = cpu_info_func;
+}
+
+int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
+                    const void* b_ptr, int rgb_step, int rgb_stride,
+                    int rgb_bit_depth, void* y_ptr, int y_stride,
+                    void* u_ptr, int u_stride, void* v_ptr,
+                    int v_stride, int yuv_bit_depth, int width,
+                    int height, const SharpYuvConversionMatrix* yuv_matrix) {
+  SharpYuvConversionMatrix scaled_matrix;
+  const int rgb_max = (1 << rgb_bit_depth) - 1;
+  const int rgb_round = 1 << (rgb_bit_depth - 1);
+  const int yuv_max = (1 << yuv_bit_depth) - 1;
+  const int sfix = GetPrecisionShift(rgb_bit_depth);
+
+  if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX ||
+      r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL ||
+      u_ptr == NULL || v_ptr == NULL) {
+    return 0;
+  }
+  if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 &&
+      rgb_bit_depth != 16) {
+    return 0;
+  }
+  if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
+    return 0;
+  }
+  if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride %2 != 0)) {
+    // Step/stride should be even for uint16_t buffers.
+    return 0;
+  }
+  if (yuv_bit_depth > 8 &&
+      (y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) {
+    // Stride should be even for uint16_t buffers.
+    return 0;
+  }
+  SharpYuvInit(NULL);
+
+  // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
+  // rgb->yuv conversion matrix.
+  if (rgb_bit_depth == yuv_bit_depth) {
+    memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix));
+  } else {
+    int i;
+    for (i = 0; i < 3; ++i) {
+      scaled_matrix.rgb_to_y[i] =
+          (yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max;
+      scaled_matrix.rgb_to_u[i] =
+          (yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max;
+      scaled_matrix.rgb_to_v[i] =
+          (yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max;
+    }
+  }
+  // Also incorporate precision change scaling.
+  scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix);
+  scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
+  scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
+
+  return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
+                          rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
+                          v_ptr, v_stride, yuv_bit_depth, width, height,
+                          &scaled_matrix);
+}
+
+//------------------------------------------------------------------------------
--- a/sharpyuv/sharpyuv.h
+++ b/sharpyuv/sharpyuv.h
@ -0,0 +1,81 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_H_
+#define WEBP_SHARPYUV_SHARPYUV_H_
+
+#include <inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// SharpYUV API version following the convention from semver.org
+#define SHARPYUV_VERSION_MAJOR 0
+#define SHARPYUV_VERSION_MINOR 1
+#define SHARPYUV_VERSION_PATCH 0
+// Version as a uint32_t. The major number is the high 8 bits.
+// The minor number is the middle 8 bits. The patch number is the low 16 bits.
+#define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
+  (((MAJOR) << 24) | ((MINOR) << 16) | (PATCH))
+#define SHARPYUV_VERSION                                                \
+  SHARPYUV_MAKE_VERSION(SHARPYUV_VERSION_MAJOR, SHARPYUV_VERSION_MINOR, \
+                        SHARPYUV_VERSION_PATCH)
+
+// RGB to YUV conversion matrix, in 16 bit fixed point.
+// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
+// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
+// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
+// Then y, u and v values are divided by 1<<16 and rounded.
+typedef struct {
+  int rgb_to_y[4];
+  int rgb_to_u[4];
+  int rgb_to_v[4];
+} SharpYuvConversionMatrix;
+
+// Converts RGB to YUV420 using a downsampling algorithm that minimizes
+// artefacts caused by chroma subsampling.
+// This is slower than standard downsampling (averaging of 4 UV values).
+// Assumes that the image will be upsampled using a bilinear filter. If nearest
+// neighbor is used instead, the upsampled image might look worse than with
+// standard downsampling.
+// r_ptr, g_ptr, b_ptr: pointers to the source r, g and b channels. Should point
+//     to uint8_t buffers if rgb_bit_depth is 8, or uint16_t buffers otherwise.
+// rgb_step: distance in bytes between two horizontally adjacent pixels on the
+//     r, g and b channels. If rgb_bit_depth is > 8, it should be a
+//     multiple of 2.
+// rgb_stride: distance in bytes between two vertically adjacent pixels on the
+//     r, g, and b channels. If rgb_bit_depth is > 8, it should be a
+//     multiple of 2.
+// rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16.
+//     Note: 16 bit input is truncated to 14 bits before conversion to yuv.
+// yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12.
+// y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels.  Should
+//     point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers
+//     otherwise.
+// y_stride, u_stride, v_stride: distance in bytes between two vertically
+//     adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
+//     should be multiples of 2.
+// width, height: width and height of the image in pixels
+int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr,
+                    int rgb_step, int rgb_stride, int rgb_bit_depth,
+                    void* y_ptr, int y_stride, void* u_ptr, int u_stride,
+                    void* v_ptr, int v_stride, int yuv_bit_depth, int width,
+                    int height, const SharpYuvConversionMatrix* yuv_matrix);
+
+// TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422
+// support (it's rarely used in practice, especially for images).
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_H_
--- a/sharpyuv/sharpyuv_csp.c
+++ b/sharpyuv/sharpyuv_csp.c
@ -0,0 +1,110 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Colorspace utilities.
+
+#include "sharpyuv/sharpyuv_csp.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); }
+
+void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
+                                     SharpYuvConversionMatrix* matrix) {
+  const float kr = yuv_color_space->kr;
+  const float kb = yuv_color_space->kb;
+  const float kg = 1.0f - kr - kb;
+  const float cr = 0.5f / (1.0f - kb);
+  const float cb = 0.5f / (1.0f - kr);
+
+  const int shift = yuv_color_space->bit_depth - 8;
+
+  const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
+  float scale_y = 1.0f;
+  float add_y = 0.0f;
+  float scale_u = cr;
+  float scale_v = cb;
+  float add_uv = (float)(128 << shift);
+  assert(yuv_color_space->bit_depth >= 8);
+
+  if (yuv_color_space->range == kSharpYuvRangeLimited) {
+    scale_y *= (219 << shift) / denom;
+    scale_u *= (224 << shift) / denom;
+    scale_v *= (224 << shift) / denom;
+    add_y = (float)(16 << shift);
+  }
+
+  matrix->rgb_to_y[0] = ToFixed16(kr * scale_y);
+  matrix->rgb_to_y[1] = ToFixed16(kg * scale_y);
+  matrix->rgb_to_y[2] = ToFixed16(kb * scale_y);
+  matrix->rgb_to_y[3] = ToFixed16(add_y);
+
+  matrix->rgb_to_u[0] = ToFixed16(-kr * scale_u);
+  matrix->rgb_to_u[1] = ToFixed16(-kg * scale_u);
+  matrix->rgb_to_u[2] = ToFixed16((1 - kb) * scale_u);
+  matrix->rgb_to_u[3] = ToFixed16(add_uv);
+
+  matrix->rgb_to_v[0] = ToFixed16((1 - kr) * scale_v);
+  matrix->rgb_to_v[1] = ToFixed16(-kg * scale_v);
+  matrix->rgb_to_v[2] = ToFixed16(-kb * scale_v);
+  matrix->rgb_to_v[3] = ToFixed16(add_uv);
+}
+
+// Matrices are in YUV_FIX fixed point precision.
+// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
+static const SharpYuvConversionMatrix kWebpMatrix = {
+  {16839, 33059, 6420, 16 << 16},
+  {-9719, -19081, 28800, 128 << 16},
+  {28800, -24116, -4684, 128 << 16},
+};
+// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
+static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
+  {16829, 33039, 6416, 16 << 16},
+  {-9714, -19071, 28784, 128 << 16},
+  {28784, -24103, -4681, 128 << 16},
+};
+// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
+static const SharpYuvConversionMatrix kRec601FullMatrix = {
+  {19595, 38470, 7471, 0},
+  {-11058, -21710, 32768, 128 << 16},
+  {32768, -27439, -5329, 128 << 16},
+};
+// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
+static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
+  {11966, 40254, 4064, 16 << 16},
+  {-6596, -22189, 28784, 128 << 16},
+  {28784, -26145, -2639, 128 << 16},
+};
+// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
+static const SharpYuvConversionMatrix kRec709FullMatrix = {
+  {13933, 46871, 4732, 0},
+  {-7509, -25259, 32768, 128 << 16},
+  {32768, -29763, -3005, 128 << 16},
+};
+
+const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
+    SharpYuvMatrixType matrix_type) {
+  switch (matrix_type) {
+    case kSharpYuvMatrixWebp:
+      return &kWebpMatrix;
+    case kSharpYuvMatrixRec601Limited:
+      return &kRec601LimitedMatrix;
+    case kSharpYuvMatrixRec601Full:
+      return &kRec601FullMatrix;
+    case kSharpYuvMatrixRec709Limited:
+      return &kRec709LimitedMatrix;
+    case kSharpYuvMatrixRec709Full:
+      return &kRec709FullMatrix;
+    case kSharpYuvMatrixNum:
+      return NULL;
+  }
+  return NULL;
+}
--- a/sharpyuv/sharpyuv_csp.h
+++ b/sharpyuv/sharpyuv_csp.h
@ -0,0 +1,59 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Colorspace utilities.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_CSP_H_
+#define WEBP_SHARPYUV_SHARPYUV_CSP_H_
+
+#include "sharpyuv/sharpyuv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Range of YUV values.
+typedef enum {
+  kSharpYuvRangeFull,     // YUV values between [0;255] (for 8 bit)
+  kSharpYuvRangeLimited   // Y in [16;235], YUV in [16;240] (for 8 bit)
+} SharpYuvRange;
+
+// Constants that define a YUV color space.
+typedef struct {
+  // Kr and Kb are defined such that:
+  // Y = Kr * r + Kg * g + Kb * b where Kg = 1 - Kr - Kb.
+  float kr;
+  float kb;
+  int bit_depth;  // 8, 10 or 12
+  SharpYuvRange range;
+} SharpYuvColorSpace;
+
+// Fills in 'matrix' for the given YUVColorSpace.
+void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
+                                     SharpYuvConversionMatrix* matrix);
+
+// Enums for precomputed conversion matrices.
+typedef enum {
+  kSharpYuvMatrixWebp = 0,
+  kSharpYuvMatrixRec601Limited,
+  kSharpYuvMatrixRec601Full,
+  kSharpYuvMatrixRec709Limited,
+  kSharpYuvMatrixRec709Full,
+  kSharpYuvMatrixNum
+} SharpYuvMatrixType;
+
+// Returns a pointer to a matrix for one of the predefined colorspaces.
+const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
+    SharpYuvMatrixType matrix_type);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_CSP_H_
--- a/sharpyuv/sharpyuv_dsp.c
+++ b/sharpyuv/sharpyuv_dsp.c
@ -0,0 +1,102 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "src/dsp/cpu.h"
+
+//-----------------------------------------------------------------------------
+
+#if !WEBP_NEON_OMIT_C_CODE
+static uint16_t clip(int v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_C(const uint16_t* ref, const uint16_t* src,
+                                  uint16_t* dst, int len, int bit_depth) {
+  uint64_t diff = 0;
+  int i;
+  const int max_y = (1 << bit_depth) - 1;
+  for (i = 0; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip(new_y, max_y);
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;
+}
+
+static void SharpYuvUpdateRGB_C(const int16_t* ref, const int16_t* src,
+                                int16_t* dst, int len) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYuvFilterRow_C(const int16_t* A, const int16_t* B, int len,
+                                const uint16_t* best_y, uint16_t* out,
+                                int bit_depth) {
+  int i;
+  const int max_y = (1 << bit_depth) - 1;
+  for (i = 0; i < len; ++i, ++A, ++B) {
+    const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
+    const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
+    out[2 * i + 0] = clip(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//-----------------------------------------------------------------------------
+
+uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
+                            uint16_t* dst, int len, int bit_depth);
+void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst,
+                          int len);
+void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
+                          const uint16_t* best_y, uint16_t* out,
+                          int bit_depth);
+
+extern void InitSharpYuvSSE2(void);
+extern void InitSharpYuvNEON(void);
+
+void SharpYuvInitDsp(VP8CPUInfo cpu_info_func) {
+  (void)cpu_info_func;
+
+#if !WEBP_NEON_OMIT_C_CODE
+  SharpYuvUpdateY = SharpYuvUpdateY_C;
+  SharpYuvUpdateRGB = SharpYuvUpdateRGB_C;
+  SharpYuvFilterRow = SharpYuvFilterRow_C;
+#endif
+
+#if defined(WEBP_HAVE_SSE2)
+  if (cpu_info_func == NULL || cpu_info_func(kSSE2)) {
+    InitSharpYuvSSE2();
+  }
+#endif  // WEBP_HAVE_SSE2
+
+#if defined(WEBP_HAVE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE || cpu_info_func == NULL || cpu_info_func(kNEON)) {
+    InitSharpYuvNEON();
+  }
+#endif  // WEBP_HAVE_NEON
+
+  assert(SharpYuvUpdateY != NULL);
+  assert(SharpYuvUpdateRGB != NULL);
+  assert(SharpYuvFilterRow != NULL);
+}
--- a/sharpyuv/sharpyuv_dsp.h
+++ b/sharpyuv/sharpyuv_dsp.h
@ -0,0 +1,29 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_DSP_H_
+#define WEBP_SHARPYUV_SHARPYUV_DSP_H_
+
+#include <stdint.h>
+
+#include "src/dsp/cpu.h"
+
+extern uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
+                                   uint16_t* dst, int len, int bit_depth);
+extern void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref,
+                                 int16_t* dst, int len);
+extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
+                                 const uint16_t* best_y, uint16_t* out,
+                                 int bit_depth);
+
+void SharpYuvInitDsp(VP8CPUInfo cpu_info_func);
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_DSP_H_
--- a/sharpyuv/sharpyuv_gamma.c
+++ b/sharpyuv/sharpyuv_gamma.c
@ -0,0 +1,114 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Gamma correction utilities.
+
+#include "sharpyuv/sharpyuv_gamma.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "src/webp/types.h"
+
+// Gamma correction compensates loss of resolution during chroma subsampling.
+// Size of pre-computed table for converting from gamma to linear.
+#define GAMMA_TO_LINEAR_TAB_BITS 10
+#define GAMMA_TO_LINEAR_TAB_SIZE (1 << GAMMA_TO_LINEAR_TAB_BITS)
+static uint32_t kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 2];
+#define LINEAR_TO_GAMMA_TAB_BITS 9
+#define LINEAR_TO_GAMMA_TAB_SIZE (1 << LINEAR_TO_GAMMA_TAB_BITS)
+static uint32_t kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 2];
+
+static const double kGammaF = 1. / 0.45;
+#define GAMMA_TO_LINEAR_BITS 16
+
+static volatile int kGammaTablesSOk = 0;
+void SharpYuvInitGammaTables(void) {
+  assert(GAMMA_TO_LINEAR_BITS <= 16);
+  if (!kGammaTablesSOk) {
+    int v;
+    const double a = 0.09929682680944;
+    const double thresh = 0.018053968510807;
+    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
+    // Precompute gamma to linear table.
+    {
+      const double norm = 1. / GAMMA_TO_LINEAR_TAB_SIZE;
+      const double a_rec = 1. / (1. + a);
+      for (v = 0; v <= GAMMA_TO_LINEAR_TAB_SIZE; ++v) {
+        const double g = norm * v;
+        double value;
+        if (g <= thresh * 4.5) {
+          value = g / 4.5;
+        } else {
+          value = pow(a_rec * (g + a), kGammaF);
+        }
+        kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
+      }
+      // to prevent small rounding errors to cause read-overflow:
+      kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 1] =
+          kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE];
+    }
+    // Precompute linear to gamma table.
+    {
+      const double scale = 1. / LINEAR_TO_GAMMA_TAB_SIZE;
+      for (v = 0; v <= LINEAR_TO_GAMMA_TAB_SIZE; ++v) {
+        const double g = scale * v;
+        double value;
+        if (g <= thresh) {
+          value = 4.5 * g;
+        } else {
+          value = (1. + a) * pow(g, 1. / kGammaF) - a;
+        }
+        kLinearToGammaTabS[v] =
+            (uint32_t)(final_scale * value + 0.5);
+      }
+      // to prevent small rounding errors to cause read-overflow:
+      kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 1] =
+          kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE];
+    }
+    kGammaTablesSOk = 1;
+  }
+}
+
+static WEBP_INLINE int Shift(int v, int shift) {
+  return (shift >= 0) ? (v << shift) : (v >> -shift);
+}
+
+static WEBP_INLINE uint32_t FixedPointInterpolation(int v, uint32_t* tab,
+                                                    int tab_pos_shift_right,
+                                                    int tab_value_shift) {
+  const uint32_t tab_pos = Shift(v, -tab_pos_shift_right);
+  // fractional part, in 'tab_pos_shift' fixed-point precision
+  const uint32_t x = v - (tab_pos << tab_pos_shift_right);  // fractional part
+  // v0 / v1 are in kGammaToLinearBits fixed-point precision (range [0..1])
+  const uint32_t v0 = Shift(tab[tab_pos + 0], tab_value_shift);
+  const uint32_t v1 = Shift(tab[tab_pos + 1], tab_value_shift);
+  // Final interpolation.
+  const uint32_t v2 = (v1 - v0) * x;  // note: v1 >= v0.
+  const int half =
+      (tab_pos_shift_right > 0) ? 1 << (tab_pos_shift_right - 1) : 0;
+  const uint32_t result = v0 + ((v2 + half) >> tab_pos_shift_right);
+  return result;
+}
+
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth) {
+  const int shift = GAMMA_TO_LINEAR_TAB_BITS - bit_depth;
+  if (shift > 0) {
+    return kGammaToLinearTabS[v << shift];
+  }
+  return FixedPointInterpolation(v, kGammaToLinearTabS, -shift, 0);
+}
+
+uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth) {
+  return FixedPointInterpolation(
+      value, kLinearToGammaTabS,
+      (GAMMA_TO_LINEAR_BITS - LINEAR_TO_GAMMA_TAB_BITS),
+      bit_depth - GAMMA_TO_LINEAR_BITS);
+}
--- a/sharpyuv/sharpyuv_gamma.h
+++ b/sharpyuv/sharpyuv_gamma.h
@ -0,0 +1,35 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Gamma correction utilities.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
+#define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Initializes precomputed tables. Must be called once before calling
+// SharpYuvGammaToLinear or SharpYuvLinearToGamma.
+void SharpYuvInitGammaTables(void);
+
+// Converts a gamma color value on 'bit_depth' bits to a 16 bit linear value.
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth);
+
+// Converts a 16 bit linear color value to a gamma value on 'bit_depth' bits.
+uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
--- a/sharpyuv/sharpyuv_neon.c
+++ b/sharpyuv/sharpyuv_neon.c
@ -0,0 +1,182 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_NEON)
+#include <assert.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+#endif
+
+extern void InitSharpYuvNEON(void);
+
+#if defined(WEBP_USE_NEON)
+
+static uint16_t clip_NEON(int v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const int16x8_t zero = vdupq_n_s16(0);
+  const int16x8_t max = vdupq_n_s16(max_y);
+  uint64x2_t sum = vdupq_n_u64(0);
+  uint64_t diff;
+
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
+    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
+    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
+    const int16x8_t D = vsubq_s16(A, B);       // diff_y
+    const int16x8_t F = vaddq_s16(C, D);       // new_y
+    const uint16x8_t H =
+        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
+    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
+    vst1q_u16(dst + i, H);
+    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
+  }
+  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
+  for (; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)(dst[i]) + diff_y;
+    dst[i] = clip_NEON(new_y, max_y);
+    diff += (uint64_t)(abs(diff_y));
+  }
+  return diff;
+}
+
+static void SharpYuvUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i;
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t A = vld1q_s16(ref + i);
+    const int16x8_t B = vld1q_s16(src + i);
+    const int16x8_t C = vld1q_s16(dst + i);
+    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
+    const int16x8_t E = vaddq_s16(C, D);   // new_uv
+    vst1q_s16(dst + i, E);
+  }
+  for (; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYuvFilterRow16_NEON(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const int16x8_t max = vdupq_n_s16(max_y);
+  const int16x8_t zero = vdupq_n_s16(0);
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t a0 = vld1q_s16(A + i + 0);
+    const int16x8_t a1 = vld1q_s16(A + i + 1);
+    const int16x8_t b0 = vld1q_s16(B + i + 0);
+    const int16x8_t b1 = vld1q_s16(B + i + 1);
+    const int16x8_t a0b1 = vaddq_s16(a0, b1);
+    const int16x8_t a1b0 = vaddq_s16(a1, b0);
+    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
+    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
+    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
+    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
+    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
+    const int16x8_t e0 = vrhaddq_s16(c1, a0);
+    const int16x8_t e1 = vrhaddq_s16(c0, a1);
+    const int16x8x2_t f = vzipq_s16(e0, e1);
+    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
+    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
+    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
+    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
+    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
+    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
+    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
+    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
+  }
+  for (; i < len; ++i) {
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static void SharpYuvFilterRow32_NEON(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const uint16x8_t max = vdupq_n_u16(max_y);
+  for (i = 0; i + 4 <= len; i += 4) {
+    const int16x4_t a0 = vld1_s16(A + i + 0);
+    const int16x4_t a1 = vld1_s16(A + i + 1);
+    const int16x4_t b0 = vld1_s16(B + i + 0);
+    const int16x4_t b1 = vld1_s16(B + i + 1);
+    const int32x4_t a0b1 = vaddl_s16(a0, b1);
+    const int32x4_t a1b0 = vaddl_s16(a1, b0);
+    const int32x4_t a0a1b0b1 = vaddq_s32(a0b1, a1b0);  // A0+A1+B0+B1
+    const int32x4_t a0b1_2 = vaddq_s32(a0b1, a0b1);    // 2*(A0+B1)
+    const int32x4_t a1b0_2 = vaddq_s32(a1b0, a1b0);    // 2*(A1+B0)
+    const int32x4_t c0 = vshrq_n_s32(vaddq_s32(a0b1_2, a0a1b0b1), 3);
+    const int32x4_t c1 = vshrq_n_s32(vaddq_s32(a1b0_2, a0a1b0b1), 3);
+    const int32x4_t e0 = vrhaddq_s32(c1, vmovl_s16(a0));
+    const int32x4_t e1 = vrhaddq_s32(c0, vmovl_s16(a1));
+    const int32x4x2_t f = vzipq_s32(e0, e1);
+
+    const int16x8_t g = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i));
+    const int32x4_t h0 = vaddw_s16(f.val[0], vget_low_s16(g));
+    const int32x4_t h1 = vaddw_s16(f.val[1], vget_high_s16(g));
+    const uint16x8_t i_16 = vcombine_u16(vqmovun_s32(h0), vqmovun_s32(h1));
+    const uint16x8_t i_clamped = vminq_u16(i_16, max);
+    vst1q_u16(out + 2 * i + 0, i_clamped);
+  }
+  for (; i < len; ++i) {
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out,
+                                   int bit_depth) {
+  if (bit_depth <= 10) {
+    SharpYuvFilterRow16_NEON(A, B, len, best_y, out, bit_depth);
+  } else {
+    SharpYuvFilterRow32_NEON(A, B, len, best_y, out, bit_depth);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) {
+  SharpYuvUpdateY = SharpYuvUpdateY_NEON;
+  SharpYuvUpdateRGB = SharpYuvUpdateRGB_NEON;
+  SharpYuvFilterRow = SharpYuvFilterRow_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+
+void InitSharpYuvNEON(void) {}
+
+#endif  // WEBP_USE_NEON
--- a/sharpyuv/sharpyuv_sse2.c
+++ b/sharpyuv/sharpyuv_sse2.c
@ -0,0 +1,204 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <stdlib.h>
+#include <emmintrin.h>
+#endif
+
+extern void InitSharpYuvSSE2(void);
+
+#if defined(WEBP_USE_SSE2)
+
+static uint16_t clip_SSE2(int v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  uint64_t diff = 0;
+  uint32_t tmp[4];
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i max = _mm_set1_epi16(max_y);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i sum = zero;
+
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
+    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
+    const __m128i F = _mm_add_epi16(C, D);       // new_y
+    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
+    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
+    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
+    _mm_storeu_si128((__m128i*)(dst + i), H);
+    sum = _mm_add_epi32(sum, I);
+  }
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+  for (; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip_SSE2(new_y, max_y);
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;
+}
+
+static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i = 0;
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
+    const __m128i E = _mm_add_epi16(C, D);   // new_uv
+    _mm_storeu_si128((__m128i*)(dst + i), E);
+  }
+  for (; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const __m128i kCst8 = _mm_set1_epi16(8);
+  const __m128i max = _mm_set1_epi16(max_y);
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
+    const __m128i a0b1 = _mm_add_epi16(a0, b1);
+    const __m128i a1b0 = _mm_add_epi16(a1, b0);
+    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
+    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
+    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
+    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
+    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
+    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
+    const __m128i d0 = _mm_add_epi16(c1, a0);
+    const __m128i d1 = _mm_add_epi16(c0, a1);
+    const __m128i e0 = _mm_srai_epi16(d0, 1);
+    const __m128i e1 = _mm_srai_epi16(d1, 1);
+    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
+    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
+    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
+    const __m128i h0 = _mm_add_epi16(g0, f0);
+    const __m128i h1 = _mm_add_epi16(g1, f1);
+    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
+    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
+  }
+  for (; i < len; ++i) {
+    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+    // We reuse the common sub-expressions.
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static WEBP_INLINE __m128i s16_to_s32(__m128i in) {
+  return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16);
+}
+
+static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const __m128i kCst8 = _mm_set1_epi32(8);
+  const __m128i max = _mm_set1_epi16(max_y);
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 4 <= len; i += 4) {
+    const __m128i a0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 0)));
+    const __m128i a1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 1)));
+    const __m128i b0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 0)));
+    const __m128i b1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 1)));
+    const __m128i a0b1 = _mm_add_epi32(a0, b1);
+    const __m128i a1b0 = _mm_add_epi32(a1, b0);
+    const __m128i a0a1b0b1 = _mm_add_epi32(a0b1, a1b0);  // A0+A1+B0+B1
+    const __m128i a0a1b0b1_8 = _mm_add_epi32(a0a1b0b1, kCst8);
+    const __m128i a0b1_2 = _mm_add_epi32(a0b1, a0b1);  // 2*(A0+B1)
+    const __m128i a1b0_2 = _mm_add_epi32(a1b0, a1b0);  // 2*(A1+B0)
+    const __m128i c0 = _mm_srai_epi32(_mm_add_epi32(a0b1_2, a0a1b0b1_8), 3);
+    const __m128i c1 = _mm_srai_epi32(_mm_add_epi32(a1b0_2, a0a1b0b1_8), 3);
+    const __m128i d0 = _mm_add_epi32(c1, a0);
+    const __m128i d1 = _mm_add_epi32(c0, a1);
+    const __m128i e0 = _mm_srai_epi32(d0, 1);
+    const __m128i e1 = _mm_srai_epi32(d1, 1);
+    const __m128i f0 = _mm_unpacklo_epi32(e0, e1);
+    const __m128i f1 = _mm_unpackhi_epi32(e0, e1);
+    const __m128i g = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+    const __m128i h_16 = _mm_add_epi16(g, _mm_packs_epi32(f0, f1));
+    const __m128i final = _mm_max_epi16(_mm_min_epi16(h_16, max), zero);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), final);
+  }
+  for (; i < len; ++i) {
+    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+    // We reuse the common sub-expressions.
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out,
+                                   int bit_depth) {
+  if (bit_depth <= 10) {
+    SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth);
+  } else {
+    SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+extern void InitSharpYuvSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) {
+  SharpYuvUpdateY = SharpYuvUpdateY_SSE2;
+  SharpYuvUpdateRGB = SharpYuvUpdateRGB_SSE2;
+  SharpYuvFilterRow = SharpYuvFilterRow_SSE2;
+}
+#else  // !WEBP_USE_SSE2
+
+void InitSharpYuvSSE2(void) {}
+
+#endif  // WEBP_USE_SSE2
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -36,7 +36,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 8:3:1
+libwebp_la_LDFLAGS = -no-undefined -version-info 8:4:1
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

@ -48,7 +48,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la

-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 4:3:1
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 4:4:1
  pkgconfig_DATA += libwebpdecoder.pc
 endif

--- a/src/dec/vp8i_dec.h
+++ b/src/dec/vp8i_dec.h
@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 1
 #define DEC_MIN_VERSION 2
-#define DEC_REV_VERSION 2
+#define DEC_REV_VERSION 3

 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
--- a/src/dec/vp8l_dec.c
+++ b/src/dec/vp8l_dec.c
@ -178,7 +178,7 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {

 //------------------------------------------------------------------------------
 // Decodes the next Huffman code from bit-stream.
-// FillBitWindow(br) needs to be called at minimum every second call
+// VP8LFillBitWindow(br) needs to be called at minimum every second call
 // to ReadSymbol, in order to pre-fetch enough bits.
 static WEBP_INLINE int ReadSymbol(const HuffmanCode* table,
                                  VP8LBitReader* const br) {
@ -253,11 +253,11 @@ static int ReadHuffmanCodeLengths(
  int symbol;
  int max_symbol;
  int prev_code_len = DEFAULT_CODE_LENGTH;
-  HuffmanCode table[1 << LENGTHS_TABLE_BITS];
+  HuffmanTables tables;

-  if (!VP8LBuildHuffmanTable(table, LENGTHS_TABLE_BITS,
-                             code_length_code_lengths,
-                             NUM_CODE_LENGTH_CODES)) {
+  if (!VP8LHuffmanTablesAllocate(1 << LENGTHS_TABLE_BITS, &tables) ||
+      !VP8LBuildHuffmanTable(&tables, LENGTHS_TABLE_BITS,
+                             code_length_code_lengths, NUM_CODE_LENGTH_CODES)) {
    goto End;
  }

@ -277,7 +277,7 @@ static int ReadHuffmanCodeLengths(
    int code_len;
    if (max_symbol-- == 0) break;
    VP8LFillBitWindow(br);
-    p = &table[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK];
+    p = &tables.curr_segment->start[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK];
    VP8LSetBitPos(br, br->bit_pos_ + p->bits);
    code_len = p->value;
    if (code_len < kCodeLengthLiterals) {
@ -300,6 +300,7 @@ static int ReadHuffmanCodeLengths(
  ok = 1;

 End:
+  VP8LHuffmanTablesDeallocate(&tables);
  if (!ok) dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
  return ok;
 }
@ -307,7 +308,8 @@ static int ReadHuffmanCodeLengths(
 // 'code_lengths' is pre-allocated temporary buffer, used for creating Huffman
 // tree.
 static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
-                           int* const code_lengths, HuffmanCode* const table) {
+                           int* const code_lengths,
+                           HuffmanTables* const table) {
  int ok = 0;
  int size = 0;
  VP8LBitReader* const br = &dec->br_;
@ -321,7 +323,7 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
    // The first code is either 1 bit or 8 bit code.
    int symbol = VP8LReadBits(br, (first_symbol_len_code == 0) ? 1 : 8);
    code_lengths[symbol] = 1;
-    // The second code (if present), is always 8 bit long.
+    // The second code (if present), is always 8 bits long.
    if (num_symbols == 2) {
      symbol = VP8LReadBits(br, 8);
      code_lengths[symbol] = 1;
@ -362,8 +364,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
  VP8LMetadata* const hdr = &dec->hdr_;
  uint32_t* huffman_image = NULL;
  HTreeGroup* htree_groups = NULL;
-  HuffmanCode* huffman_tables = NULL;
-  HuffmanCode* huffman_table = NULL;
+  HuffmanTables* huffman_tables = &hdr->huffman_tables_;
  int num_htree_groups = 1;
  int num_htree_groups_max = 1;
  int max_alphabet_size = 0;
@ -372,6 +373,10 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
  int* mapping = NULL;
  int ok = 0;

+  // Check the table has been 0 initialized (through InitMetadata).
+  assert(huffman_tables->root.start == NULL);
+  assert(huffman_tables->curr_segment == NULL);
+
  if (allow_recursion && VP8LReadBits(br, 1)) {
    // use meta Huffman codes.
    const int huffman_precision = VP8LReadBits(br, 3) + 2;
@ -434,16 +439,15 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,

  code_lengths = (int*)WebPSafeCalloc((uint64_t)max_alphabet_size,
                                      sizeof(*code_lengths));
-  huffman_tables = (HuffmanCode*)WebPSafeMalloc(num_htree_groups * table_size,
-                                                sizeof(*huffman_tables));
  htree_groups = VP8LHtreeGroupsNew(num_htree_groups);

-  if (htree_groups == NULL || code_lengths == NULL || huffman_tables == NULL) {
+  if (htree_groups == NULL || code_lengths == NULL ||
+      !VP8LHuffmanTablesAllocate(num_htree_groups * table_size,
+                                 huffman_tables)) {
    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
    goto Error;
  }

-  huffman_table = huffman_tables;
  for (i = 0; i < num_htree_groups_max; ++i) {
    // If the index "i" is unused in the Huffman image, just make sure the
    // coefficients are valid but do not store them.
@ -468,19 +472,20 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
      int max_bits = 0;
      for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
        int alphabet_size = kAlphabetSize[j];
-        htrees[j] = huffman_table;
        if (j == 0 && color_cache_bits > 0) {
          alphabet_size += (1 << color_cache_bits);
        }
-        size = ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_table);
+        size =
+            ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_tables);
+        htrees[j] = huffman_tables->curr_segment->curr_table;
        if (size == 0) {
          goto Error;
        }
        if (is_trivial_literal && kLiteralMap[j] == 1) {
-          is_trivial_literal = (huffman_table->bits == 0);
+          is_trivial_literal = (htrees[j]->bits == 0);
        }
-        total_size += huffman_table->bits;
-        huffman_table += size;
+        total_size += htrees[j]->bits;
+        huffman_tables->curr_segment->curr_table += size;
        if (j <= ALPHA) {
          int local_max_bits = code_lengths[0];
          int k;
@ -515,14 +520,13 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
  hdr->huffman_image_ = huffman_image;
  hdr->num_htree_groups_ = num_htree_groups;
  hdr->htree_groups_ = htree_groups;
-  hdr->huffman_tables_ = huffman_tables;

 Error:
  WebPSafeFree(code_lengths);
  WebPSafeFree(mapping);
  if (!ok) {
    WebPSafeFree(huffman_image);
-    WebPSafeFree(huffman_tables);
+    VP8LHuffmanTablesDeallocate(huffman_tables);
    VP8LHtreeGroupsFree(htree_groups);
  }
  return ok;
@ -1281,7 +1285,7 @@ static int ExpandColorMap(int num_colors, VP8LTransform* const transform) {
    uint8_t* const new_data = (uint8_t*)new_color_map;
    new_color_map[0] = transform->data_[0];
    for (i = 4; i < 4 * num_colors; ++i) {
-      // Equivalent to AddPixelEq(), on a byte-basis.
+      // Equivalent to VP8LAddPixels(), on a byte-basis.
      new_data[i] = (data[i] + new_data[i - 4]) & 0xff;
    }
    for (; i < 4 * final_num_colors; ++i) {
@ -1358,7 +1362,7 @@ static void ClearMetadata(VP8LMetadata* const hdr) {
  assert(hdr != NULL);

  WebPSafeFree(hdr->huffman_image_);
-  WebPSafeFree(hdr->huffman_tables_);
+  VP8LHuffmanTablesDeallocate(&hdr->huffman_tables_);
  VP8LHtreeGroupsFree(hdr->htree_groups_);
  VP8LColorCacheClear(&hdr->color_cache_);
  VP8LColorCacheClear(&hdr->saved_color_cache_);
@ -1673,7 +1677,7 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {

  if (dec == NULL) return 0;

-  assert(dec->hdr_.huffman_tables_ != NULL);
+  assert(dec->hdr_.huffman_tables_.root.start != NULL);
  assert(dec->hdr_.htree_groups_ != NULL);
  assert(dec->hdr_.num_htree_groups_ > 0);

--- a/src/dec/vp8li_dec.h
+++ b/src/dec/vp8li_dec.h
@ -51,7 +51,7 @@ typedef struct {
  uint32_t*       huffman_image_;
  int             num_htree_groups_;
  HTreeGroup*     htree_groups_;
-  HuffmanCode*    huffman_tables_;
+  HuffmanTables   huffman_tables_;
 } VP8LMetadata;

 typedef struct VP8LDecoder VP8LDecoder;
--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -13,6 +13,6 @@ noinst_HEADERS =
 noinst_HEADERS += ../webp/format_constants.h

 libwebpdemux_la_LIBADD = ../libwebp.la
-libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:9:0
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:10:0
 libwebpdemuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/anim_decode.c
+++ b/src/demux/anim_decode.c
@ -23,6 +23,14 @@

 #define NUM_CHANNELS 4

+// Channel extraction from a uint32_t representation of a uint8_t RGBA/BGRA
+// buffer.
+#ifdef WORDS_BIGENDIAN
+#define CHANNEL_SHIFT(i) (24 - (i) * 8)
+#else
+#define CHANNEL_SHIFT(i) ((i) * 8)
+#endif
+
 typedef void (*BlendRowFunc)(uint32_t* const, const uint32_t* const, int);
 static void BlendPixelRowNonPremult(uint32_t* const src,
                                    const uint32_t* const dst, int num_pixels);
@ -209,35 +217,35 @@ static uint8_t BlendChannelNonPremult(uint32_t src, uint8_t src_a,
  const uint8_t dst_channel = (dst >> shift) & 0xff;
  const uint32_t blend_unscaled = src_channel * src_a + dst_channel * dst_a;
  assert(blend_unscaled < (1ULL << 32) / scale);
-  return (blend_unscaled * scale) >> 24;
+  return (blend_unscaled * scale) >> CHANNEL_SHIFT(3);
 }

 // Blend 'src' over 'dst' assuming they are NOT pre-multiplied by alpha.
 static uint32_t BlendPixelNonPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;

  if (src_a == 0) {
    return dst;
  } else {
-    const uint8_t dst_a = (dst >> 24) & 0xff;
+    const uint8_t dst_a = (dst >> CHANNEL_SHIFT(3)) & 0xff;
    // This is the approximate integer arithmetic for the actual formula:
    // dst_factor_a = (dst_a * (255 - src_a)) / 255.
    const uint8_t dst_factor_a = (dst_a * (256 - src_a)) >> 8;
    const uint8_t blend_a = src_a + dst_factor_a;
    const uint32_t scale = (1UL << 24) / blend_a;

-    const uint8_t blend_r =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 0);
-    const uint8_t blend_g =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 8);
-    const uint8_t blend_b =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 16);
+    const uint8_t blend_r = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(0));
+    const uint8_t blend_g = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(1));
+    const uint8_t blend_b = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(2));
    assert(src_a + dst_factor_a < 256);

-    return (blend_r << 0) |
-           (blend_g << 8) |
-           (blend_b << 16) |
-           ((uint32_t)blend_a << 24);
+    return ((uint32_t)blend_r << CHANNEL_SHIFT(0)) |
+           ((uint32_t)blend_g << CHANNEL_SHIFT(1)) |
+           ((uint32_t)blend_b << CHANNEL_SHIFT(2)) |
+           ((uint32_t)blend_a << CHANNEL_SHIFT(3));
  }
 }

@ -247,7 +255,7 @@ static void BlendPixelRowNonPremult(uint32_t* const src,
                                    const uint32_t* const dst, int num_pixels) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
    if (src_alpha != 0xff) {
      src[i] = BlendPixelNonPremult(src[i], dst[i]);
    }
@ -264,7 +272,7 @@ static WEBP_INLINE uint32_t ChannelwiseMultiply(uint32_t pix, uint32_t scale) {

 // Blend 'src' over 'dst' assuming they are pre-multiplied by alpha.
 static uint32_t BlendPixelPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;
  return src + ChannelwiseMultiply(dst, 256 - src_a);
 }

@ -274,7 +282,7 @@ static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
                                 int num_pixels) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
    if (src_alpha != 0xff) {
      src[i] = BlendPixelPremult(src[i], dst[i]);
    }
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@ -25,7 +25,7 @@

 #define DMUX_MAJ_VERSION 1
 #define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 2
+#define DMUX_REV_VERSION 3

 typedef struct {
  size_t start_;        // start location of the data
@ -614,7 +614,6 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {

  while (f != NULL) {
    const int cur_frame_set = f->frame_num_;
-    int frame_count = 0;

    // Check frame properties.
    for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
@ -649,8 +648,6 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
                            dmux->canvas_width_, dmux->canvas_height_)) {
        return 0;
      }
-
-      ++frame_count;
    }
  }
  return 1;
--- a/src/demux/libwebpdemux.rc
+++ b/src/demux/libwebpdemux.rc
@ -6,8 +6,8 @@
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 1,0,2,2
- PRODUCTVERSION 1,0,2,2
+ FILEVERSION 1,0,2,3
+ PRODUCTVERSION 1,0,2,3
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@ -24,12 +24,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "Google, Inc."
            VALUE "FileDescription", "libwebpdemux DLL"
-            VALUE "FileVersion", "1.2.2"
+            VALUE "FileVersion", "1.2.3"
            VALUE "InternalName", "libwebpdemux.dll"
-            VALUE "LegalCopyright", "Copyright (C) 2021"
+            VALUE "LegalCopyright", "Copyright (C) 2022"
            VALUE "OriginalFilename", "libwebpdemux.dll"
            VALUE "ProductName", "WebP Image Demuxer"
-            VALUE "ProductVersion", "1.2.2"
+            VALUE "ProductVersion", "1.2.3"
        END
    END
    BLOCK "VarFileInfo"
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -24,6 +24,7 @@ commondir = $(includedir)/webp
 COMMON_SOURCES =
 COMMON_SOURCES += alpha_processing.c
 COMMON_SOURCES += cpu.c
+COMMON_SOURCES += cpu.h
 COMMON_SOURCES += dec.c
 COMMON_SOURCES += dec_clip_tables.c
 COMMON_SOURCES += dsp.h
--- a/src/dsp/alpha_processing_neon.c
+++ b/src/dsp/alpha_processing_neon.c
@ -83,7 +83,7 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
 static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
                              int alpha_stride, int width, int height,
                              uint8_t* WEBP_RESTRICT dst, int dst_stride) {
-  uint32_t alpha_mask = 0xffffffffu;
+  uint32_t alpha_mask = 0xffu;
  uint8x8_t mask8 = vdup_n_u8(0xff);
  uint32_t tmp[2];
  int i, j;
@ -107,6 +107,7 @@ static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
    dst += dst_stride;
  }
  vst1_u8((uint8_t*)tmp, mask8);
+  alpha_mask *= 0x01010101;
  alpha_mask &= tmp[0];
  alpha_mask &= tmp[1];
  return (alpha_mask != 0xffffffffu);
@ -135,7 +136,7 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha,
 static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                             int width, int height,
                             uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
-  uint32_t alpha_mask = 0xffffffffu;
+  uint32_t alpha_mask = 0xffu;
  uint8x8_t mask8 = vdup_n_u8(0xff);
  uint32_t tmp[2];
  int i, j;
@ -157,6 +158,7 @@ static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
    alpha += alpha_stride;
  }
  vst1_u8((uint8_t*)tmp, mask8);
+  alpha_mask *= 0x01010101;
  alpha_mask &= tmp[0];
  alpha_mask &= tmp[1];
  return (alpha_mask == 0xffffffffu);
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -11,7 +11,7 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)

-#include "src/dsp/dsp.h"
+#include "src/dsp/cpu.h"

 #if defined(WEBP_HAVE_NEON_RTCD)
 #include <stdio.h>
--- a/src/dsp/cpu.h
+++ b/src/dsp/cpu.h
@ -0,0 +1,254 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   CPU detection functions and macros.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_CPU_H_
+#define WEBP_DSP_CPU_H_
+
+#ifdef HAVE_CONFIG_H
+#include "src/webp/config.h"
+#endif
+
+#include "src/webp/types.h"
+
+#if defined(__GNUC__)
+#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_GCC_VERSION 0
+#define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#if defined(__clang__)
+#define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+#define LOCAL_CLANG_PREREQ(maj, min) \
+  (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_CLANG_VERSION 0
+#define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if !defined(HAVE_CONFIG_H)
+#if defined(_MSC_VER) && _MSC_VER > 1310 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
+#endif
+#endif
+
+// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
+// files without intrinsics, allowing the corresponding Init() to be called.
+// Files containing intrinsics will need to be built targeting the instruction
+// set so should succeed on one of the earlier tests.
+#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
+#define WEBP_USE_SSE2
+#endif
+
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
+#define WEBP_HAVE_SSE2
+#endif
+
+#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
+#define WEBP_USE_SSE41
+#endif
+
+#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
+#define WEBP_HAVE_SSE41
+#endif
+
+#undef WEBP_MSC_SSE41
+#undef WEBP_MSC_SSE2
+
+// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
+// inline assembly would need to be modified for use with Native Client.
+#if ((defined(__ARM_NEON__) || defined(__aarch64__)) &&       \
+     (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
+    !defined(__native_client__)
+#define WEBP_USE_NEON
+#endif
+
+#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
+    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
+#define WEBP_ANDROID_NEON  // Android targets that may have NEON
+#define WEBP_USE_NEON
+#endif
+
+// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
+// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
+// arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with
+// vtbl4_u8(); a fix was made in 16.6.
+#if defined(_MSC_VER) && ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+                          (_MSC_VER >= 1926 && defined(_M_ARM64)))
+#define WEBP_USE_NEON
+#define WEBP_USE_INTRINSICS
+#endif
+
+#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
+#define WEBP_HAVE_NEON
+#endif
+
+#if defined(__mips__) && !defined(__mips64) && defined(__mips_isa_rev) && \
+    (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
+#define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
+#define WEBP_USE_MIPS_DSP_R2
+#endif
+#endif
+#endif
+
+#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
+#define WEBP_USE_MSA
+#endif
+
+#ifndef WEBP_DSP_OMIT_C_CODE
+#define WEBP_DSP_OMIT_C_CODE 1
+#endif
+
+#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
+#define WEBP_NEON_OMIT_C_CODE 1
+#else
+#define WEBP_NEON_OMIT_C_CODE 0
+#endif
+
+#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || \
+      defined(__aarch64__))
+#define WEBP_NEON_WORK_AROUND_GCC 1
+#else
+#define WEBP_NEON_WORK_AROUND_GCC 0
+#endif
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#define WEBP_TSAN_IGNORE_FUNCTION
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#undef WEBP_TSAN_IGNORE_FUNCTION
+#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
+#endif
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define WEBP_MSAN
+#endif
+#endif
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>  // NOLINT
+
+#define WEBP_DSP_INIT(func)                                         \
+  do {                                                              \
+    static volatile VP8CPUInfo func##_last_cpuinfo_used =           \
+        (VP8CPUInfo)&func##_last_cpuinfo_used;                      \
+    static pthread_mutex_t func##_lock = PTHREAD_MUTEX_INITIALIZER; \
+    if (pthread_mutex_lock(&func##_lock)) break;                    \
+    if (func##_last_cpuinfo_used != VP8GetCPUInfo) func();          \
+    func##_last_cpuinfo_used = VP8GetCPUInfo;                       \
+    (void)pthread_mutex_unlock(&func##_lock);                       \
+  } while (0)
+#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define WEBP_DSP_INIT(func)                               \
+  do {                                                    \
+    static volatile VP8CPUInfo func##_last_cpuinfo_used = \
+        (VP8CPUInfo)&func##_last_cpuinfo_used;            \
+    if (func##_last_cpuinfo_used == VP8GetCPUInfo) break; \
+    func();                                               \
+    func##_last_cpuinfo_used = VP8GetCPUInfo;             \
+  } while (0)
+#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
+// Defines an Init + helper function that control multiple initialization of
+// function pointers / tables.
+/* Usage:
+   WEBP_DSP_INIT_FUNC(InitFunc) {
+     ...function body
+   }
+*/
+#define WEBP_DSP_INIT_FUNC(name)                                            \
+  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void);                  \
+  WEBP_TSAN_IGNORE_FUNCTION void name(void) { WEBP_DSP_INIT(name##_body); } \
+  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void)
+
+#define WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures. This is only meant to silence unaligned loads on platforms that
+// are known to support them.
+#undef WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNDEF __attribute__((no_sanitize("undefined")))
+
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures related to unsigned integer overflows. This is only meant to
+// silence cases where this well defined behavior is expected.
+#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
+// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
+#if !defined(WEBP_OFFSET_PTR)
+#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
+#endif
+
+// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
+#if !defined(WEBP_SWAP_16BIT_CSP)
+#define WEBP_SWAP_16BIT_CSP 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) &&                   \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+typedef enum {
+  kSSE2,
+  kSSE3,
+  kSlowSSSE3,  // special feature for slow SSSE3 architectures
+  kSSE4_1,
+  kAVX,
+  kAVX2,
+  kNEON,
+  kMIPS32,
+  kMIPSdspR2,
+  kMSA
+} CPUFeature;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// returns true if the CPU supports the feature.
+typedef int (*VP8CPUInfo)(CPUFeature feature);
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_CPU_H_
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -18,6 +18,7 @@
 #include "src/webp/config.h"
 #endif

+#include "src/dsp/cpu.h"
 #include "src/webp/types.h"

 #ifdef __cplusplus
@ -43,225 +44,6 @@ extern "C" {
 #define WEBP_RESTRICT
 #endif

-//------------------------------------------------------------------------------
-// CPU detection
-
-#if defined(__GNUC__)
-# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
-# define LOCAL_GCC_PREREQ(maj, min) \
-    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_GCC_VERSION 0
-# define LOCAL_GCC_PREREQ(maj, min) 0
-#endif
-
-#if defined(__clang__)
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif
-
-#ifndef __has_builtin
-# define __has_builtin(x) 0
-#endif
-
-#if !defined(HAVE_CONFIG_H)
-#if defined(_MSC_VER) && _MSC_VER > 1310 && \
-    (defined(_M_X64) || defined(_M_IX86))
-#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
-    (defined(_M_X64) || defined(_M_IX86))
-#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
-#endif
-#endif
-
-// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
-// files without intrinsics, allowing the corresponding Init() to be called.
-// Files containing intrinsics will need to be built targeting the instruction
-// set so should succeed on one of the earlier tests.
-#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
-    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
-#define WEBP_USE_SSE2
-#endif
-
-#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
-#define WEBP_HAVE_SSE2
-#endif
-
-#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
-    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
-#define WEBP_USE_SSE41
-#endif
-
-#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
-#define WEBP_HAVE_SSE41
-#endif
-
-#undef WEBP_MSC_SSE41
-#undef WEBP_MSC_SSE2
-
-// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
-// inline assembly would need to be modified for use with Native Client.
-#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \
-     (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
-    !defined(__native_client__)
-#define WEBP_USE_NEON
-#endif
-
-#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
-    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
-#define WEBP_ANDROID_NEON  // Android targets that may have NEON
-#define WEBP_USE_NEON
-#endif
-
-// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
-// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
-// arm_neon.h.
-#if defined(_MSC_VER) && \
-  ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
-   (_MSC_VER >= 1920 && defined(_M_ARM64)))
-#define WEBP_USE_NEON
-#define WEBP_USE_INTRINSICS
-#endif
-
-#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
-#define WEBP_HAVE_NEON
-#endif
-
-#if defined(__mips__) && !defined(__mips64) && \
-    defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
-#define WEBP_USE_MIPS32
-#if (__mips_isa_rev >= 2)
-#define WEBP_USE_MIPS32_R2
-#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
-#define WEBP_USE_MIPS_DSP_R2
-#endif
-#endif
-#endif
-
-#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
-#define WEBP_USE_MSA
-#endif
-
-#ifndef WEBP_DSP_OMIT_C_CODE
-#define WEBP_DSP_OMIT_C_CODE 1
-#endif
-
-#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
-#define WEBP_NEON_OMIT_C_CODE 1
-#else
-#define WEBP_NEON_OMIT_C_CODE 0
-#endif
-
-#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
-#define WEBP_NEON_WORK_AROUND_GCC 1
-#else
-#define WEBP_NEON_WORK_AROUND_GCC 0
-#endif
-
-// This macro prevents thread_sanitizer from reporting known concurrent writes.
-#define WEBP_TSAN_IGNORE_FUNCTION
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#undef WEBP_TSAN_IGNORE_FUNCTION
-#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
-#endif
-#endif
-
-#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
-#include <pthread.h>  // NOLINT
-
-#define WEBP_DSP_INIT(func) do {                                    \
-  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
-      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
-  static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \
-  if (pthread_mutex_lock(&func ## _lock)) break;                    \
-  if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func();          \
-  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
-  (void)pthread_mutex_unlock(&func ## _lock);                       \
-} while (0)
-#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
-#define WEBP_DSP_INIT(func) do {                                    \
-  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
-      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
-  if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break;           \
-  func();                                                           \
-  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
-} while (0)
-#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
-
-// Defines an Init + helper function that control multiple initialization of
-// function pointers / tables.
-/* Usage:
-   WEBP_DSP_INIT_FUNC(InitFunc) {
-     ...function body
-   }
-*/
-#define WEBP_DSP_INIT_FUNC(name)                             \
-  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \
-  WEBP_TSAN_IGNORE_FUNCTION void name(void) {                \
-    WEBP_DSP_INIT(name ## _body);                            \
-  }                                                          \
-  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void)
-
-#define WEBP_UBSAN_IGNORE_UNDEF
-#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#if defined(__clang__) && defined(__has_attribute)
-#if __has_attribute(no_sanitize)
-// This macro prevents the undefined behavior sanitizer from reporting
-// failures. This is only meant to silence unaligned loads on platforms that
-// are known to support them.
-#undef WEBP_UBSAN_IGNORE_UNDEF
-#define WEBP_UBSAN_IGNORE_UNDEF \
-  __attribute__((no_sanitize("undefined")))
-
-// This macro prevents the undefined behavior sanitizer from reporting
-// failures related to unsigned integer overflows. This is only meant to
-// silence cases where this well defined behavior is expected.
-#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
-  __attribute__((no_sanitize("unsigned-integer-overflow")))
-#endif
-#endif
-
-// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
-// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
-#if !defined(WEBP_OFFSET_PTR)
-#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
-#endif
-
-// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
-#if !defined(WEBP_SWAP_16BIT_CSP)
-#define WEBP_SWAP_16BIT_CSP 0
-#endif
-
-// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
-#if !defined(WORDS_BIGENDIAN) && \
-    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
-     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
-#define WORDS_BIGENDIAN
-#endif
-
-typedef enum {
-  kSSE2,
-  kSSE3,
-  kSlowSSSE3,  // special feature for slow SSSE3 architectures
-  kSSE4_1,
-  kAVX,
-  kAVX2,
-  kNEON,
-  kMIPS32,
-  kMIPSdspR2,
-  kMSA
-} CPUFeature;
-// returns true if the CPU supports the feature.
-typedef int (*VP8CPUInfo)(CPUFeature feature);
-WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;

 //------------------------------------------------------------------------------
 // Init stub generator
@ -550,15 +332,6 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
 extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
                                    uint8_t* u, uint8_t* v, int width);

-// utilities for accurate RGB->YUV conversion
-extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
-                                       uint16_t* dst, int len);
-extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
-                                     int16_t* dst, int len);
-extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B,
-                                     int len,
-                                     const uint16_t* best_y, uint16_t* out);
-
 // Must be called before using the above.
 void WebPInitConvertARGBToYUV(void);

--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -182,8 +182,8 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 // -----------------------------------------------------------------------------
 // Huffman-cost related functions.

-typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
-typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+typedef float (*VP8LCostFunc)(const uint32_t* population, int length);
+typedef float (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
                                      int length);
 typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
                                                const int Y[256]);
@ -198,7 +198,7 @@ typedef struct {        // small struct to hold counters
 } VP8LStreaks;

 typedef struct {            // small struct to hold bit entropy results
-  double entropy;           // entropy
+  float entropy;            // entropy
  uint32_t sum;             // sum of the population
  int nonzeros;             // number of non-zero elements in the population
  uint32_t max_val;         // maximum value in the population
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
@ -402,7 +402,7 @@ static float FastLog2Slow_C(uint32_t v) {
 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
 static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
  int i;
-  double retval = 0.;
+  float retval = 0.f;
  int sumX = 0, sumXY = 0;
  for (i = 0; i < 256; ++i) {
    const int x = X[i];
@ -418,7 +418,7 @@ static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
    }
  }
  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
-  return (float)retval;
+  return retval;
 }

 void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
@ -636,17 +636,17 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,

 //------------------------------------------------------------------------------

-static double ExtraCost_C(const uint32_t* population, int length) {
+static float ExtraCost_C(const uint32_t* population, int length) {
  int i;
-  double cost = 0.;
+  float cost = 0.f;
  for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
  return cost;
 }

-static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+static float ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
                                  int length) {
  int i;
-  double cost = 0.;
+  float cost = 0.f;
  for (i = 2; i < length - 2; ++i) {
    const int xy = X[i + 2] + Y[i + 2];
    cost += (i >> 1) * xy;
--- a/src/dsp/lossless_enc_mips32.c
+++ b/src/dsp/lossless_enc_mips32.c
@ -103,8 +103,8 @@ static float FastLog2Slow_MIPS32(uint32_t v) {
 //     cost += i * *(pop + 1);
 //     pop += 2;
 //   }
-//   return (double)cost;
-static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
+//   return (float)cost;
+static float ExtraCost_MIPS32(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];
@ -130,7 +130,7 @@ static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
    : "memory", "hi", "lo"
  );

-  return (double)((int64_t)temp0 << 32 | temp1);
+  return (float)((int64_t)temp0 << 32 | temp1);
 }

 // C version of this function:
@ -148,8 +148,8 @@ static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
 //     pX += 2;
 //     pY += 2;
 //   }
-//   return (double)cost;
-static double ExtraCostCombined_MIPS32(const uint32_t* const X,
+//   return (float)cost;
+static float ExtraCostCombined_MIPS32(const uint32_t* const X,
                                      const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
@ -183,7 +183,7 @@ static double ExtraCostCombined_MIPS32(const uint32_t* const X,
    : "memory", "hi", "lo"
  );

-  return (double)((int64_t)temp0 << 32 | temp1);
+  return (float)((int64_t)temp0 << 32 | temp1);
 }

 #define HUFFMAN_COST_PASS                                 \
@ -347,24 +347,24 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
 static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
                             uint32_t* pout, int size) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-  const uint32_t end = ((size) / 4) * 4;
+  const int end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
  int i;
  ASM_START
  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
  ASM_END_0
-  for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i];
+  for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];
 }

 static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-  const uint32_t end = ((size) / 4) * 4;
+  const int end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
  int i;
  ASM_START
  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
  ASM_END_1
-  for (i = end; i < size; ++i) pout[i] += pa[i];
+  for (i = 0; i < size - end; ++i) pout[i] += pa[i];
 }

 #undef ASM_END_1
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@ -239,7 +239,7 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {

 static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
  int i;
-  double retval = 0.;
+  float retval = 0.f;
  int sumX = 0, sumXY = 0;
  const __m128i zero = _mm_setzero_si128();

@ -273,7 +273,7 @@ static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
    }
  }
  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
-  return (float)retval;
+  return retval;
 }

 #else
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@ -194,50 +194,6 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,

 //-----------------------------------------------------------------------------

-#if !WEBP_NEON_OMIT_C_CODE
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
-                                  uint16_t* dst, int len) {
-  uint64_t diff = 0;
-  int i;
-  for (i = 0; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)dst[i] + diff_y;
-    dst[i] = clip_y(new_y);
-    diff += (uint64_t)abs(diff_y);
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
-                                int16_t* dst, int len) {
-  int i;
-  for (i = 0; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
-                                const uint16_t* best_y, uint16_t* out) {
-  int i;
-  for (i = 0; i < len; ++i, ++A, ++B) {
-    const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
-    const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
-    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
-  }
-}
-#endif  // !WEBP_NEON_OMIT_C_CODE
-
-#undef MAX_Y
-
-//-----------------------------------------------------------------------------
-
 void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
 void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
 void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
@ -247,18 +203,9 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
 void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
                            int src_width, int do_store);

-uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src,
-                                uint16_t* dst, int len);
-void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
-                              int16_t* dst, int len);
-void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
-                              const uint16_t* best_y, uint16_t* out);
-
 extern void WebPInitConvertARGBToYUVSSE2(void);
 extern void WebPInitConvertARGBToYUVSSE41(void);
 extern void WebPInitConvertARGBToYUVNEON(void);
-extern void WebPInitSharpYUVSSE2(void);
-extern void WebPInitSharpYUVNEON(void);

 WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
  WebPConvertARGBToY = ConvertARGBToY_C;
@ -269,17 +216,10 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {

  WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;

-#if !WEBP_NEON_OMIT_C_CODE
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
-#endif
-
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_HAVE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitConvertARGBToYUVSSE2();
-      WebPInitSharpYUVSSE2();
    }
 #endif  // WEBP_HAVE_SSE2
 #if defined(WEBP_HAVE_SSE41)
@ -293,7 +233,6 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
  if (WEBP_NEON_OMIT_C_CODE ||
      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
    WebPInitConvertARGBToYUVNEON();
-    WebPInitSharpYUVNEON();
  }
 #endif  // WEBP_HAVE_NEON

@ -302,7 +241,4 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
  assert(WebPConvertRGB24ToY != NULL);
  assert(WebPConvertBGR24ToY != NULL);
  assert(WebPConvertRGBA32ToUV != NULL);
-  assert(WebPSharpYUVUpdateY != NULL);
-  assert(WebPSharpYUVUpdateRGB != NULL);
-  assert(WebPSharpYUVFilterRow != NULL);
 }
--- a/src/dsp/yuv_neon.c
+++ b/src/dsp/yuv_neon.c
@ -173,116 +173,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
 }

-//------------------------------------------------------------------------------
-
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y_NEON(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
-                                     uint16_t* dst, int len) {
-  int i;
-  const int16x8_t zero = vdupq_n_s16(0);
-  const int16x8_t max = vdupq_n_s16(MAX_Y);
-  uint64x2_t sum = vdupq_n_u64(0);
-  uint64_t diff;
-
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
-    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
-    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
-    const int16x8_t D = vsubq_s16(A, B);       // diff_y
-    const int16x8_t F = vaddq_s16(C, D);       // new_y
-    const uint16x8_t H =
-        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
-    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
-    vst1q_u16(dst + i, H);
-    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
-  }
-  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
-  for (; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)(dst[i]) + diff_y;
-    dst[i] = clip_y_NEON(new_y);
-    diff += (uint64_t)(abs(diff_y));
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
-                                   int16_t* dst, int len) {
-  int i;
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t A = vld1q_s16(ref + i);
-    const int16x8_t B = vld1q_s16(src + i);
-    const int16x8_t C = vld1q_s16(dst + i);
-    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
-    const int16x8_t E = vaddq_s16(C, D);   // new_uv
-    vst1q_s16(dst + i, E);
-  }
-  for (; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
-                                   const uint16_t* best_y, uint16_t* out) {
-  int i;
-  const int16x8_t max = vdupq_n_s16(MAX_Y);
-  const int16x8_t zero = vdupq_n_s16(0);
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t a0 = vld1q_s16(A + i + 0);
-    const int16x8_t a1 = vld1q_s16(A + i + 1);
-    const int16x8_t b0 = vld1q_s16(B + i + 0);
-    const int16x8_t b1 = vld1q_s16(B + i + 1);
-    const int16x8_t a0b1 = vaddq_s16(a0, b1);
-    const int16x8_t a1b0 = vaddq_s16(a1, b0);
-    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
-    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
-    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
-    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
-    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
-    const int16x8_t d0 = vaddq_s16(c1, a0);
-    const int16x8_t d1 = vaddq_s16(c0, a1);
-    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
-    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
-    const int16x8x2_t f = vzipq_s16(e0, e1);
-    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
-    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
-    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
-    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
-    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
-    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
-    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
-    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
-  }
-  for (; i < len; ++i) {
-    const int a0b1 = A[i + 0] + B[i + 1];
-    const int a1b0 = A[i + 1] + B[i + 0];
-    const int a0a1b0b1 = a0b1 + a1b0 + 8;
-    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
-    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
-    out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
-  }
-}
-#undef MAX_Y
-
-//------------------------------------------------------------------------------
-
-extern void WebPInitSharpYUVNEON(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
-}
-
 #else  // !WEBP_USE_NEON

 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
-WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)

 #endif  // WEBP_USE_NEON
--- a/src/dsp/yuv_sse2.c
+++ b/src/dsp/yuv_sse2.c
@ -747,128 +747,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
 }

-//------------------------------------------------------------------------------
-
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
-                                     uint16_t* dst, int len) {
-  uint64_t diff = 0;
-  uint32_t tmp[4];
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i max = _mm_set1_epi16(MAX_Y);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i sum = zero;
-
-  for (i = 0; i + 8 <= len; i += 8) {
-    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
-    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
-    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
-    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
-    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
-    const __m128i F = _mm_add_epi16(C, D);       // new_y
-    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
-    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
-    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
-    _mm_storeu_si128((__m128i*)(dst + i), H);
-    sum = _mm_add_epi32(sum, I);
-  }
-  _mm_storeu_si128((__m128i*)tmp, sum);
-  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
-  for (; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)dst[i] + diff_y;
-    dst[i] = clip_y(new_y);
-    diff += (uint64_t)abs(diff_y);
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
-                                   int16_t* dst, int len) {
-  int i = 0;
-  for (i = 0; i + 8 <= len; i += 8) {
-    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
-    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
-    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
-    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
-    const __m128i E = _mm_add_epi16(C, D);   // new_uv
-    _mm_storeu_si128((__m128i*)(dst + i), E);
-  }
-  for (; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
-                                   const uint16_t* best_y, uint16_t* out) {
-  int i;
-  const __m128i kCst8 = _mm_set1_epi16(8);
-  const __m128i max = _mm_set1_epi16(MAX_Y);
-  const __m128i zero = _mm_setzero_si128();
-  for (i = 0; i + 8 <= len; i += 8) {
-    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
-    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
-    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
-    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
-    const __m128i a0b1 = _mm_add_epi16(a0, b1);
-    const __m128i a1b0 = _mm_add_epi16(a1, b0);
-    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
-    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
-    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
-    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
-    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
-    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
-    const __m128i d0 = _mm_add_epi16(c1, a0);
-    const __m128i d1 = _mm_add_epi16(c0, a1);
-    const __m128i e0 = _mm_srai_epi16(d0, 1);
-    const __m128i e1 = _mm_srai_epi16(d1, 1);
-    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
-    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
-    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
-    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
-    const __m128i h0 = _mm_add_epi16(g0, f0);
-    const __m128i h1 = _mm_add_epi16(g1, f1);
-    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
-    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
-    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
-    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
-  }
-  for (; i < len; ++i) {
-    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
-    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
-    // We reuse the common sub-expressions.
-    const int a0b1 = A[i + 0] + B[i + 1];
-    const int a1b0 = A[i + 1] + B[i + 0];
-    const int a0a1b0b1 = a0b1 + a1b0 + 8;
-    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
-    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
-    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
-  }
-}
-
-#undef MAX_Y
-
-//------------------------------------------------------------------------------
-
-extern void WebPInitSharpYUVSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
-}
-
 #else  // !WEBP_USE_SSE2

 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
-WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)

 #endif  // WEBP_USE_SSE2
--- a/src/enc/Makefile.am
+++ b/src/enc/Makefile.am
@ -37,6 +37,7 @@ libwebpencodeinclude_HEADERS += ../webp/types.h
 noinst_HEADERS =
 noinst_HEADERS += ../webp/format_constants.h

+libwebpencode_la_LIBADD = ../../sharpyuv/libsharpyuv.la
 libwebpencode_la_LDFLAGS = -lm
 libwebpencode_la_CPPFLAGS = $(AM_CPPFLAGS)
 libwebpencodeincludedir = $(includedir)/webp
--- a/src/enc/alpha_enc.c
+++ b/src/enc/alpha_enc.c
@ -86,7 +86,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
  // a decoder bug related to alpha with color cache.
  // See: https://code.google.com/p/webp/issues/detail?id=239
  // Need to re-enable this later.
-  ok = (VP8LEncodeStream(&config, &picture, bw, 0 /*use_cache*/) == VP8_ENC_OK);
+  ok = VP8LEncodeStream(&config, &picture, bw, /*use_cache=*/0);
  WebPPictureFree(&picture);
  ok = ok && !bw->error_;
  if (!ok) {
--- a/src/enc/backward_references_cost_enc.c
+++ b/src/enc/backward_references_cost_enc.c
@ -15,10 +15,11 @@
 //

 #include <assert.h>
+#include <float.h>

+#include "src/dsp/lossless_common.h"
 #include "src/enc/backward_references_enc.h"
 #include "src/enc/histogram_enc.h"
-#include "src/dsp/lossless_common.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/utils.h"

@ -30,15 +31,15 @@ extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
                                      const PixOrCopy v);

 typedef struct {
-  double alpha_[VALUES_IN_BYTE];
-  double red_[VALUES_IN_BYTE];
-  double blue_[VALUES_IN_BYTE];
-  double distance_[NUM_DISTANCE_CODES];
-  double* literal_;
+  float alpha_[VALUES_IN_BYTE];
+  float red_[VALUES_IN_BYTE];
+  float blue_[VALUES_IN_BYTE];
+  float distance_[NUM_DISTANCE_CODES];
+  float* literal_;
 } CostModel;

 static void ConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const uint32_t population_counts[], double output[]) {
+    int num_symbols, const uint32_t population_counts[], float output[]) {
  uint32_t sum = 0;
  int nonzeros = 0;
  int i;
@ -51,7 +52,7 @@ static void ConvertPopulationCountTableToBitEstimates(
  if (nonzeros <= 1) {
    memset(output, 0, num_symbols * sizeof(*output));
  } else {
-    const double logsum = VP8LFastLog2(sum);
+    const float logsum = VP8LFastLog2(sum);
    for (i = 0; i < num_symbols; ++i) {
      output[i] = logsum - VP8LFastLog2(population_counts[i]);
    }
@ -75,8 +76,8 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
  }

  ConvertPopulationCountTableToBitEstimates(
-      VP8LHistogramNumCodes(histo->palette_code_bits_),
-      histo->literal_, m->literal_);
+      VP8LHistogramNumCodes(histo->palette_code_bits_), histo->literal_,
+      m->literal_);
  ConvertPopulationCountTableToBitEstimates(
      VALUES_IN_BYTE, histo->red_, m->red_);
  ConvertPopulationCountTableToBitEstimates(
@ -92,26 +93,26 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
  return ok;
 }

-static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+static WEBP_INLINE float GetLiteralCost(const CostModel* const m, uint32_t v) {
  return m->alpha_[v >> 24] +
         m->red_[(v >> 16) & 0xff] +
         m->literal_[(v >> 8) & 0xff] +
         m->blue_[v & 0xff];
 }

-static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+static WEBP_INLINE float GetCacheCost(const CostModel* const m, uint32_t idx) {
  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
  return m->literal_[literal_idx];
 }

-static WEBP_INLINE double GetLengthCost(const CostModel* const m,
+static WEBP_INLINE float GetLengthCost(const CostModel* const m,
                                       uint32_t length) {
  int code, extra_bits;
  VP8LPrefixEncodeBits(length, &code, &extra_bits);
  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
 }

-static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
+static WEBP_INLINE float GetDistanceCost(const CostModel* const m,
                                         uint32_t distance) {
  int code, extra_bits;
  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
@ -122,20 +123,20 @@ static WEBP_INLINE void AddSingleLiteralWithCostModel(
    const uint32_t* const argb, VP8LColorCache* const hashers,
    const CostModel* const cost_model, int idx, int use_color_cache,
    float prev_cost, float* const cost, uint16_t* const dist_array) {
-  double cost_val = prev_cost;
+  float cost_val = prev_cost;
  const uint32_t color = argb[idx];
  const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
  if (ix >= 0) {
    // use_color_cache is true and hashers contains color
-    const double mul0 = 0.68;
+    const float mul0 = 0.68f;
    cost_val += GetCacheCost(cost_model, ix) * mul0;
  } else {
-    const double mul1 = 0.82;
+    const float mul1 = 0.82f;
    if (use_color_cache) VP8LColorCacheInsert(hashers, color);
    cost_val += GetLiteralCost(cost_model, color) * mul1;
  }
  if (cost[idx] > cost_val) {
-    cost[idx] = (float)cost_val;
+    cost[idx] = cost_val;
    dist_array[idx] = 1;  // only one is inserted.
  }
 }
@ -172,7 +173,7 @@ struct CostInterval {

 // The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
 typedef struct {
-  double cost_;
+  float cost_;
  int start_;
  int end_;       // Exclusive.
 } CostCacheInterval;
@ -187,7 +188,7 @@ typedef struct {
  int count_;  // The number of stored intervals.
  CostCacheInterval* cache_intervals_;
  size_t cache_intervals_size_;
-  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
+  float cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
  float* costs_;
  uint16_t* dist_array_;
  // Most of the time, we only need few intervals -> use a free-list, to avoid
@ -262,10 +263,13 @@ static int CostManagerInit(CostManager* const manager,
  CostManagerInitFreeList(manager);

  // Fill in the cost_cache_.
-  manager->cache_intervals_size_ = 1;
-  manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
-  for (i = 1; i < cost_cache_size; ++i) {
+  // Has to be done in two passes due to a GCC bug on i686
+  // related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
+  for (i = 0; i < cost_cache_size; ++i) {
    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
+  }
+  manager->cache_intervals_size_ = 1;
+  for (i = 1; i < cost_cache_size; ++i) {
    // Get the number of bound intervals.
    if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
      ++manager->cache_intervals_size_;
@ -294,7 +298,7 @@ static int CostManagerInit(CostManager* const manager,
    cur->end_ = 1;
    cur->cost_ = manager->cost_cache_[0];
    for (i = 1; i < cost_cache_size; ++i) {
-      const double cost_val = manager->cost_cache_[i];
+      const float cost_val = manager->cost_cache_[i];
      if (cost_val != cur->cost_) {
        ++cur;
        // Initialize an interval.
@ -303,6 +307,8 @@ static int CostManagerInit(CostManager* const manager,
      }
      cur->end_ = i + 1;
    }
+    assert((size_t)(cur - manager->cache_intervals_) + 1 ==
+           manager->cache_intervals_size_);
  }

  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
@ -311,7 +317,7 @@ static int CostManagerInit(CostManager* const manager,
    return 0;
  }
  // Set the initial costs_ high for every pixel as we will keep the minimum.
-  for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
+  for (i = 0; i < pix_count; ++i) manager->costs_[i] = FLT_MAX;

  return 1;
 }
@ -457,7 +463,7 @@ static WEBP_INLINE void InsertInterval(CostManager* const manager,
 // If handling the interval or one of its subintervals becomes to heavy, its
 // contribution is added to the costs right away.
 static WEBP_INLINE void PushInterval(CostManager* const manager,
-                                     double distance_cost, int position,
+                                     float distance_cost, int position,
                                     int len) {
  size_t i;
  CostInterval* interval = manager->head_;
@ -474,7 +480,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
      const int k = j - position;
      float cost_tmp;
      assert(k >= 0 && k < MAX_LENGTH);
-      cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
+      cost_tmp = distance_cost + manager->cost_cache_[k];

      if (manager->costs_[j] > cost_tmp) {
        manager->costs_[j] = cost_tmp;
@ -492,7 +498,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
    const int end = position + (cost_cache_intervals[i].end_ > len
                                 ? len
                                 : cost_cache_intervals[i].end_);
-    const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
+    const float cost = distance_cost + cost_cache_intervals[i].cost_;

    for (; interval != NULL && interval->start_ < end;
         interval = interval_next) {
@ -570,22 +576,21 @@ static int BackwardReferencesHashChainDistanceOnly(
  const int pix_count = xsize * ysize;
  const int use_color_cache = (cache_bits > 0);
  const size_t literal_array_size =
-      sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
-                        ((cache_bits > 0) ? (1 << cache_bits) : 0));
+      sizeof(float) * (VP8LHistogramNumCodes(cache_bits));
  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
  CostModel* const cost_model =
      (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
  VP8LColorCache hashers;
  CostManager* cost_manager =
-      (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
+      (CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
  int offset_prev = -1, len_prev = -1;
-  double offset_cost = -1;
+  float offset_cost = -1.f;
  int first_offset_is_constant = -1;  // initialized with 'impossible' value
  int reach = 0;

  if (cost_model == NULL || cost_manager == NULL) goto Error;

-  cost_model->literal_ = (double*)(cost_model + 1);
+  cost_model->literal_ = (float*)(cost_model + 1);
  if (use_color_cache) {
    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
    if (!cc_init) goto Error;
--- a/src/enc/backward_references_enc.c
+++ b/src/enc/backward_references_enc.c
@ -10,6 +10,8 @@
 // Author: Jyrki Alakuijala (jyrki@google.com)
 //

+#include "src/enc/backward_references_enc.h"
+
 #include <assert.h>
 #include <float.h>
 #include <math.h>
@ -17,10 +19,11 @@
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
-#include "src/enc/backward_references_enc.h"
 #include "src/enc/histogram_enc.h"
+#include "src/enc/vp8i_enc.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/utils.h"
+#include "src/webp/encode.h"

 #define MIN_BLOCK_SIZE 256  // minimum block size for backward references

@ -255,10 +258,13 @@ static WEBP_INLINE int MaxFindCopyLength(int len) {

 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                      const uint32_t* const argb, int xsize, int ysize,
-                      int low_effort) {
+                      int low_effort, const WebPPicture* const pic,
+                      int percent_range, int* const percent) {
  const int size = xsize * ysize;
  const int iter_max = GetMaxItersForQuality(quality);
  const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
+  int remaining_percent = percent_range;
+  int percent_start = *percent;
  int pos;
  int argb_comp;
  uint32_t base_position;
@ -276,7 +282,13 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,

  hash_to_first_index =
      (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
-  if (hash_to_first_index == NULL) return 0;
+  if (hash_to_first_index == NULL) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return 0;
+  }
+
+  percent_range = remaining_percent / 2;
+  remaining_percent -= percent_range;

  // Set the int32_t array to -1.
  memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
@ -323,12 +335,22 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
      hash_to_first_index[hash_code] = pos++;
      argb_comp = argb_comp_next;
    }
+
+    if (!WebPReportProgress(
+            pic, percent_start + percent_range * pos / (size - 2), percent)) {
+      WebPSafeFree(hash_to_first_index);
+      return 0;
+    }
  }
  // Process the penultimate pixel.
  chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];

  WebPSafeFree(hash_to_first_index);

+  percent_start += percent_range;
+  if (!WebPReportProgress(pic, percent_start, percent)) return 0;
+  percent_range = remaining_percent;
+
  // Find the best match interval at each pixel, defined by an offset to the
  // pixel and a length. The right-most pixel cannot match anything to the right
  // (hence a best length of 0) and the left-most pixel nothing to the left
@ -417,8 +439,17 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
        max_base_position = base_position;
      }
    }
+
+    if (!WebPReportProgress(pic,
+                            percent_start + percent_range *
+                                                (size - 2 - base_position) /
+                                                (size - 2),
+                            percent)) {
+      return 0;
    }
-  return 1;
+  }
+
+  return WebPReportProgress(pic, percent_start + percent_range, percent);
 }

 static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
@ -728,7 +759,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
                                  int* const best_cache_bits) {
  int i;
  const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
-  double entropy_min = MAX_ENTROPY;
+  float entropy_min = MAX_ENTROPY;
  int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
  VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
@ -813,7 +844,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
  }

  for (i = 0; i <= cache_bits_max; ++i) {
-    const double entropy = VP8LHistogramEstimateBits(histos[i]);
+    const float entropy = VP8LHistogramEstimateBits(histos[i]);
    if (i == 0 || entropy < entropy_min) {
      entropy_min = entropy;
      *best_cache_bits = i;
@ -890,7 +921,7 @@ static int GetBackwardReferences(int width, int height,
  int i, lz77_type;
  // Index 0 is for a color cache, index 1 for no cache (if needed).
  int lz77_types_best[2] = {0, 0};
-  double bit_costs_best[2] = {DBL_MAX, DBL_MAX};
+  float bit_costs_best[2] = {FLT_MAX, FLT_MAX};
  VP8LHashChain hash_chain_box;
  VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
  int status = 0;
@ -902,7 +933,7 @@ static int GetBackwardReferences(int width, int height,
  for (lz77_type = 1; lz77_types_to_try;
       lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
    int res = 0;
-    double bit_cost = 0.;
+    float bit_cost = 0.f;
    if ((lz77_types_to_try & lz77_type) == 0) continue;
    switch (lz77_type) {
      case kLZ77RLE:
@ -976,17 +1007,18 @@ static int GetBackwardReferences(int width, int height,
      const VP8LHashChain* const hash_chain_tmp =
          (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
      const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
-      if (VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
+      float bit_cost_trace;
+      if (!VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
                                                hash_chain_tmp, &refs[i],
                                                refs_tmp)) {
-        double bit_cost_trace;
+        goto Error;
+      }
      VP8LHistogramCreate(histo, refs_tmp, cache_bits);
      bit_cost_trace = VP8LHistogramEstimateBits(histo);
      if (bit_cost_trace < bit_costs_best[i]) {
        BackwardRefsSwap(refs_tmp, &refs[i]);
      }
    }
-    }

    BackwardReferences2DLocality(width, &refs[i]);

@ -1006,25 +1038,31 @@ Error:
  return status;
 }

-WebPEncodingError VP8LGetBackwardReferences(
+int VP8LGetBackwardReferences(
    int width, int height, const uint32_t* const argb, int quality,
    int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
-    int* const cache_bits_best) {
+    int* const cache_bits_best, const WebPPicture* const pic, int percent_range,
+    int* const percent) {
  if (low_effort) {
    VP8LBackwardRefs* refs_best;
    *cache_bits_best = cache_bits_max;
    refs_best = GetBackwardReferencesLowEffort(
        width, height, argb, cache_bits_best, hash_chain, refs);
-    if (refs_best == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+    if (refs_best == NULL) {
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+      return 0;
+    }
    // Set it in first position.
    BackwardRefsSwap(refs_best, &refs[0]);
  } else {
    if (!GetBackwardReferences(width, height, argb, quality, lz77_types_to_try,
                               cache_bits_max, do_no_cache, hash_chain, refs,
                               cache_bits_best)) {
-      return VP8_ENC_ERROR_OUT_OF_MEMORY;
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+      return 0;
    }
  }
-  return VP8_ENC_OK;
+
+  return WebPReportProgress(pic, *percent + percent_range, percent);
 }
--- a/src/enc/backward_references_enc.h
+++ b/src/enc/backward_references_enc.h
@ -134,10 +134,11 @@ struct VP8LHashChain {

 // Must be called first, to set size.
 int VP8LHashChainInit(VP8LHashChain* const p, int size);
-// Pre-compute the best matches for argb.
+// Pre-compute the best matches for argb. pic and percent are for progress.
 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                      const uint32_t* const argb, int xsize, int ysize,
-                      int low_effort);
+                      int low_effort, const WebPPicture* const pic,
+                      int percent_range, int* const percent);
 void VP8LHashChainClear(VP8LHashChain* const p);  // release memory

 static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
@ -227,11 +228,14 @@ enum VP8LLZ77Type {
 // VP8LBackwardRefs is put in the first element, the best value with no-cache in
 // the second element.
 // In both cases, the last element is used as temporary internally.
-WebPEncodingError VP8LGetBackwardReferences(
+// pic and percent are for progress.
+// Returns false in case of error (stored in pic->error_code).
+int VP8LGetBackwardReferences(
    int width, int height, const uint32_t* const argb, int quality,
    int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
-    int* const cache_bits_best);
+    int* const cache_bits_best, const WebPPicture* const pic, int percent_range,
+    int* const percent);

 #ifdef __cplusplus
 }
--- a/src/enc/frame_enc.c
+++ b/src/enc/frame_enc.c
@ -778,6 +778,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
  // Roughly refresh the proba eight times per pass
  int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
  int num_pass_left = enc->config_->pass;
+  int remaining_progress = 40;  // percents
  const int do_search = enc->do_search_;
  VP8EncIterator it;
  VP8EncProba* const proba = &enc->proba_;
@ -805,6 +806,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
    uint64_t size_p0 = 0;
    uint64_t distortion = 0;
    int cnt = max_count;
+    // The final number of passes is not trivial to know in advance.
+    const int pass_progress = remaining_progress / (2 + num_pass_left);
+    remaining_progress -= pass_progress;
    VP8IteratorInit(enc, &it);
    SetLoopParams(enc, stats.q);
    if (is_last_pass) {
@ -832,7 +836,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
        StoreSideInfo(&it);
        VP8StoreFilterStats(&it);
        VP8IteratorExport(&it);
-        ok = VP8IteratorProgress(&it, 20);
+        ok = VP8IteratorProgress(&it, pass_progress);
      }
      VP8IteratorSaveBoundary(&it);
    } while (ok && VP8IteratorNext(&it));
@ -878,7 +882,8 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
    ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
                       (const uint8_t*)proba->coeffs_, 1);
  }
-  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + remaining_progress,
+                                &enc->percent_);
  return PostLoopFinalize(&it, ok);
 }

--- a/src/enc/histogram_enc.c
+++ b/src/enc/histogram_enc.c
@ -13,15 +13,17 @@
 #include "src/webp/config.h"
 #endif

+#include <float.h>
 #include <math.h>

-#include "src/enc/backward_references_enc.h"
-#include "src/enc/histogram_enc.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/enc/vp8i_enc.h"
 #include "src/utils/utils.h"

-#define MAX_COST 1.e38
+#define MAX_BIT_COST FLT_MAX

 // Number of partitions for the three dominant (literal, red and blue) symbol
 // costs.
@ -228,8 +230,8 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
 // -----------------------------------------------------------------------------
 // Entropy-related functions.

-static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
-  double mix;
+static WEBP_INLINE float BitsEntropyRefine(const VP8LBitEntropy* entropy) {
+  float mix;
  if (entropy->nonzeros < 5) {
    if (entropy->nonzeros <= 1) {
      return 0;
@ -238,65 +240,65 @@ static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
    // Let's mix in a bit of entropy to favor good clustering when
    // distributions of these are combined.
    if (entropy->nonzeros == 2) {
-      return 0.99 * entropy->sum + 0.01 * entropy->entropy;
+      return 0.99f * entropy->sum + 0.01f * entropy->entropy;
    }
    // No matter what the entropy says, we cannot be better than min_limit
    // with Huffman coding. I am mixing a bit of entropy into the
    // min_limit since it produces much better (~0.5 %) compression results
    // perhaps because of better entropy clustering.
    if (entropy->nonzeros == 3) {
-      mix = 0.95;
+      mix = 0.95f;
    } else {
-      mix = 0.7;  // nonzeros == 4.
+      mix = 0.7f;  // nonzeros == 4.
    }
  } else {
-    mix = 0.627;
+    mix = 0.627f;
  }

  {
-    double min_limit = 2 * entropy->sum - entropy->max_val;
-    min_limit = mix * min_limit + (1.0 - mix) * entropy->entropy;
+    float min_limit = 2.f * entropy->sum - entropy->max_val;
+    min_limit = mix * min_limit + (1.f - mix) * entropy->entropy;
    return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
  }
 }

-double VP8LBitsEntropy(const uint32_t* const array, int n) {
+float VP8LBitsEntropy(const uint32_t* const array, int n) {
  VP8LBitEntropy entropy;
  VP8LBitsEntropyUnrefined(array, n, &entropy);

  return BitsEntropyRefine(&entropy);
 }

-static double InitialHuffmanCost(void) {
+static float InitialHuffmanCost(void) {
  // Small bias because Huffman code length is typically not stored in
  // full length.
  static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
-  static const double kSmallBias = 9.1;
+  static const float kSmallBias = 9.1f;
  return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
 }

 // Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
-static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+static float FinalHuffmanCost(const VP8LStreaks* const stats) {
  // The constants in this function are experimental and got rounded from
  // their original values in 1/8 when switched to 1/1024.
-  double retval = InitialHuffmanCost();
+  float retval = InitialHuffmanCost();
  // Second coefficient: Many zeros in the histogram are covered efficiently
  // by a run-length encode. Originally 2/8.
-  retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+  retval += stats->counts[0] * 1.5625f + 0.234375f * stats->streaks[0][1];
  // Second coefficient: Constant values are encoded less efficiently, but still
  // RLE'ed. Originally 6/8.
-  retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+  retval += stats->counts[1] * 2.578125f + 0.703125f * stats->streaks[1][1];
  // 0s are usually encoded more efficiently than non-0s.
  // Originally 15/8.
-  retval += 1.796875 * stats->streaks[0][0];
+  retval += 1.796875f * stats->streaks[0][0];
  // Originally 26/8.
-  retval += 3.28125 * stats->streaks[1][0];
+  retval += 3.28125f * stats->streaks[1][0];
  return retval;
 }

 // Get the symbol entropy for the distribution 'population'.
 // Set 'trivial_sym', if there's only one symbol present in the distribution.
-static double PopulationCost(const uint32_t* const population, int length,
+static float PopulationCost(const uint32_t* const population, int length,
                            uint32_t* const trivial_sym,
                            uint8_t* const is_used) {
  VP8LBitEntropy bit_entropy;
@ -314,10 +316,9 @@ static double PopulationCost(const uint32_t* const population, int length,

 // trivial_at_end is 1 if the two histograms only have one element that is
 // non-zero: both the zero-th one, or both the last one.
-static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
-                                             const uint32_t* const Y,
-                                             int length, int is_X_used,
-                                             int is_Y_used,
+static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
+                                            const uint32_t* const Y, int length,
+                                            int is_X_used, int is_Y_used,
                                            int trivial_at_end) {
  VP8LStreaks stats;
  if (trivial_at_end) {
@ -356,7 +357,7 @@ static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
 }

 // Estimates the Entropy + Huffman + other block overhead size cost.
-double VP8LHistogramEstimateBits(VP8LHistogram* const p) {
+float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
  return
      PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_),
                     NULL, &p->is_used_[0])
@ -373,8 +374,7 @@ double VP8LHistogramEstimateBits(VP8LHistogram* const p) {

 static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
                                       const VP8LHistogram* const b,
-                                       double cost_threshold,
-                                       double* cost) {
+                                       float cost_threshold, float* cost) {
  const int palette_code_bits = a->palette_code_bits_;
  int trivial_at_end = 0;
  assert(a->palette_code_bits_ == b->palette_code_bits_);
@ -439,12 +439,11 @@ static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
 // Since the previous score passed is 'cost_threshold', we only need to compare
 // the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
 // early.
-static double HistogramAddEval(const VP8LHistogram* const a,
+static float HistogramAddEval(const VP8LHistogram* const a,
                              const VP8LHistogram* const b,
-                               VP8LHistogram* const out,
-                               double cost_threshold) {
-  double cost = 0;
-  const double sum_cost = a->bit_cost_ + b->bit_cost_;
+                              VP8LHistogram* const out, float cost_threshold) {
+  float cost = 0;
+  const float sum_cost = a->bit_cost_ + b->bit_cost_;
  cost_threshold += sum_cost;

  if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
@ -459,10 +458,10 @@ static double HistogramAddEval(const VP8LHistogram* const a,
 // Same as HistogramAddEval(), except that the resulting histogram
 // is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
 // the term C(b) which is constant over all the evaluations.
-static double HistogramAddThresh(const VP8LHistogram* const a,
+static float HistogramAddThresh(const VP8LHistogram* const a,
                                const VP8LHistogram* const b,
-                                 double cost_threshold) {
-  double cost;
+                                float cost_threshold) {
+  float cost;
  assert(a != NULL && b != NULL);
  cost = -a->bit_cost_;
  GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
@ -473,24 +472,22 @@ static double HistogramAddThresh(const VP8LHistogram* const a,

 // The structure to keep track of cost range for the three dominant entropy
 // symbols.
-// TODO(skal): Evaluate if float can be used here instead of double for
-// representing the entropy costs.
 typedef struct {
-  double literal_max_;
-  double literal_min_;
-  double red_max_;
-  double red_min_;
-  double blue_max_;
-  double blue_min_;
+  float literal_max_;
+  float literal_min_;
+  float red_max_;
+  float red_min_;
+  float blue_max_;
+  float blue_min_;
 } DominantCostRange;

 static void DominantCostRangeInit(DominantCostRange* const c) {
  c->literal_max_ = 0.;
-  c->literal_min_ = MAX_COST;
+  c->literal_min_ = MAX_BIT_COST;
  c->red_max_ = 0.;
-  c->red_min_ = MAX_COST;
+  c->red_min_ = MAX_BIT_COST;
  c->blue_max_ = 0.;
-  c->blue_min_ = MAX_COST;
+  c->blue_min_ = MAX_BIT_COST;
 }

 static void UpdateDominantCostRange(
@ -505,10 +502,9 @@ static void UpdateDominantCostRange(

 static void UpdateHistogramCost(VP8LHistogram* const h) {
  uint32_t alpha_sym, red_sym, blue_sym;
-  const double alpha_cost =
-      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym,
-                     &h->is_used_[3]);
-  const double distance_cost =
+  const float alpha_cost =
+      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
+  const float distance_cost =
      PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
      VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
  const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
@ -529,10 +525,10 @@ static void UpdateHistogramCost(VP8LHistogram* const h) {
  }
 }

-static int GetBinIdForEntropy(double min, double max, double val) {
-  const double range = max - min;
+static int GetBinIdForEntropy(float min, float max, float val) {
+  const float range = max - min;
  if (range > 0.) {
-    const double delta = val - min;
+    const float delta = val - min;
    return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
  } else {
    return 0;
@ -641,15 +637,11 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,

 // Merges some histograms with same bin_id together if it's advantageous.
 // Sets the remaining histograms to NULL.
-static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
-                                       int* num_used,
-                                       const uint16_t* const clusters,
-                                       uint16_t* const cluster_mappings,
-                                       VP8LHistogram* cur_combo,
-                                       const uint16_t* const bin_map,
-                                       int num_bins,
-                                       double combine_cost_factor,
-                                       int low_effort) {
+static void HistogramCombineEntropyBin(
+    VP8LHistogramSet* const image_histo, int* num_used,
+    const uint16_t* const clusters, uint16_t* const cluster_mappings,
+    VP8LHistogram* cur_combo, const uint16_t* const bin_map, int num_bins,
+    float combine_cost_factor, int low_effort) {
  VP8LHistogram** const histograms = image_histo->histograms;
  int idx;
  struct {
@ -679,11 +671,10 @@ static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
      cluster_mappings[clusters[idx]] = clusters[first];
    } else {
      // try to merge #idx into #first (both share the same bin_id)
-      const double bit_cost = histograms[idx]->bit_cost_;
-      const double bit_cost_thresh = -bit_cost * combine_cost_factor;
-      const double curr_cost_diff =
-          HistogramAddEval(histograms[first], histograms[idx],
-                           cur_combo, bit_cost_thresh);
+      const float bit_cost = histograms[idx]->bit_cost_;
+      const float bit_cost_thresh = -bit_cost * combine_cost_factor;
+      const float curr_cost_diff = HistogramAddEval(
+          histograms[first], histograms[idx], cur_combo, bit_cost_thresh);
      if (curr_cost_diff < bit_cost_thresh) {
        // Try to merge two histograms only if the combo is a trivial one or
        // the two candidate histograms are already non-trivial.
@ -731,8 +722,8 @@ static uint32_t MyRand(uint32_t* const seed) {
 typedef struct {
  int idx1;
  int idx2;
-  double cost_diff;
-  double cost_combo;
+  float cost_diff;
+  float cost_combo;
 } HistogramPair;

 typedef struct {
@ -787,10 +778,9 @@ static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
 // Update the cost diff and combo of a pair of histograms. This needs to be
 // called when the the histograms have been merged with a third one.
 static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
-                                 const VP8LHistogram* const h2,
-                                 double threshold,
+                                 const VP8LHistogram* const h2, float threshold,
                                 HistogramPair* const pair) {
-  const double sum_cost = h1->bit_cost_ + h2->bit_cost_;
+  const float sum_cost = h1->bit_cost_ + h2->bit_cost_;
  pair->cost_combo = 0.;
  GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo);
  pair->cost_diff = pair->cost_combo - sum_cost;
@ -799,9 +789,9 @@ static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
 // Create a pair from indices "idx1" and "idx2" provided its cost
 // is inferior to "threshold", a negative entropy.
 // It returns the cost of the pair, or 0. if it superior to threshold.
-static double HistoQueuePush(HistoQueue* const histo_queue,
+static float HistoQueuePush(HistoQueue* const histo_queue,
                            VP8LHistogram** const histograms, int idx1,
-                             int idx2, double threshold) {
+                            int idx2, float threshold) {
  const VP8LHistogram* h1;
  const VP8LHistogram* h2;
  HistogramPair pair;
@ -945,8 +935,8 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
           ++tries_with_no_success < num_tries_no_success;
       ++iter) {
    int* mapping_index;
-    double best_cost =
-        (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
+    float best_cost =
+        (histo_queue.size == 0) ? 0.f : histo_queue.queue[0].cost_diff;
    int best_idx1 = -1, best_idx2 = 1;
    const uint32_t rand_range = (*num_used - 1) * (*num_used);
    // (*num_used) / 2 was chosen empirically. Less means faster but worse
@ -955,7 +945,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,

    // Pick random samples.
    for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
-      double curr_cost;
+      float curr_cost;
      // Choose two different histograms at random and try to combine them.
      const uint32_t tmp = MyRand(&seed) % rand_range;
      uint32_t idx1 = tmp / (*num_used - 1);
@ -1057,7 +1047,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
  if (out_size > 1) {
    for (i = 0; i < in_size; ++i) {
      int best_out = 0;
-      double best_bits = MAX_COST;
+      float best_bits = MAX_BIT_COST;
      int k;
      if (in_histo[i] == NULL) {
        // Arbitrarily set to the previous value if unused to help future LZ77.
@ -1065,7 +1055,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
        continue;
      }
      for (k = 0; k < out_size; ++k) {
-        double cur_bits;
+        float cur_bits;
        cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits);
        if (k == 0 || cur_bits < best_bits) {
          best_bits = cur_bits;
@ -1093,13 +1083,13 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
  }
 }

-static double GetCombineCostFactor(int histo_size, int quality) {
-  double combine_cost_factor = 0.16;
+static float GetCombineCostFactor(int histo_size, int quality) {
+  float combine_cost_factor = 0.16f;
  if (quality < 90) {
-    if (histo_size > 256) combine_cost_factor /= 2.;
-    if (histo_size > 512) combine_cost_factor /= 2.;
-    if (histo_size > 1024) combine_cost_factor /= 2.;
-    if (quality <= 50) combine_cost_factor /= 2.;
+    if (histo_size > 256) combine_cost_factor /= 2.f;
+    if (histo_size > 512) combine_cost_factor /= 2.f;
+    if (histo_size > 1024) combine_cost_factor /= 2.f;
+    if (quality <= 50) combine_cost_factor /= 2.f;
  }
  return combine_cost_factor;
 }
@ -1169,13 +1159,13 @@ static void RemoveEmptyHistograms(VP8LHistogramSet* const image_histo) {
 }

 int VP8LGetHistoImageSymbols(int xsize, int ysize,
-                             const VP8LBackwardRefs* const refs,
-                             int quality, int low_effort,
-                             int histogram_bits, int cache_bits,
+                             const VP8LBackwardRefs* const refs, int quality,
+                             int low_effort, int histogram_bits, int cache_bits,
                             VP8LHistogramSet* const image_histo,
                             VP8LHistogram* const tmp_histo,
-                             uint16_t* const histogram_symbols) {
-  int ok = 0;
+                             uint16_t* const histogram_symbols,
+                             const WebPPicture* const pic, int percent_range,
+                             int* const percent) {
  const int histo_xsize =
      histogram_bits ? VP8LSubSampleSize(xsize, histogram_bits) : 1;
  const int histo_ysize =
@ -1192,7 +1182,10 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
      WebPSafeMalloc(2 * image_histo_raw_size, sizeof(map_tmp));
  uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
  int num_used = image_histo_raw_size;
-  if (orig_histo == NULL || map_tmp == NULL) goto Error;
+  if (orig_histo == NULL || map_tmp == NULL) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    goto Error;
+  }

  // Construct the histograms from backward references.
  HistogramBuild(xsize, histogram_bits, refs, orig_histo);
@ -1206,16 +1199,15 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,

  if (entropy_combine) {
    uint16_t* const bin_map = map_tmp;
-    const double combine_cost_factor =
+    const float combine_cost_factor =
        GetCombineCostFactor(image_histo_raw_size, quality);
    const uint32_t num_clusters = num_used;

    HistogramAnalyzeEntropyBin(image_histo, bin_map, low_effort);
    // Collapse histograms with similar entropy.
-    HistogramCombineEntropyBin(image_histo, &num_used, histogram_symbols,
-                               cluster_mappings, tmp_histo, bin_map,
-                               entropy_combine_num_bins, combine_cost_factor,
-                               low_effort);
+    HistogramCombineEntropyBin(
+        image_histo, &num_used, histogram_symbols, cluster_mappings, tmp_histo,
+        bin_map, entropy_combine_num_bins, combine_cost_factor, low_effort);
    OptimizeHistogramSymbols(image_histo, cluster_mappings, num_clusters,
                             map_tmp, histogram_symbols);
  }
@ -1229,11 +1221,13 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
    int do_greedy;
    if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
                                    &do_greedy)) {
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
      goto Error;
    }
    if (do_greedy) {
      RemoveEmptyHistograms(image_histo);
      if (!HistogramCombineGreedy(image_histo, &num_used)) {
+        WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
        goto Error;
      }
    }
@ -1243,10 +1237,12 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
  RemoveEmptyHistograms(image_histo);
  HistogramRemap(orig_histo, image_histo, histogram_symbols);

-  ok = 1;
+  if (!WebPReportProgress(pic, *percent + percent_range, percent)) {
+    goto Error;
+  }

 Error:
  VP8LFreeHistogramSet(orig_histo);
  WebPSafeFree(map_tmp);
-  return ok;
+  return (pic->error_code == VP8_ENC_OK);
 }
--- a/src/enc/histogram_enc.h
+++ b/src/enc/histogram_enc.h
@ -40,10 +40,10 @@ typedef struct {
  int palette_code_bits_;
  uint32_t trivial_symbol_;  // True, if histograms for Red, Blue & Alpha
                             // literal symbols are single valued.
-  double bit_cost_;          // cached value of bit cost.
-  double literal_cost_;      // Cached values of dominant entropy costs:
-  double red_cost_;          // literal, red & blue.
-  double blue_cost_;
+  float bit_cost_;           // cached value of bit cost.
+  float literal_cost_;       // Cached values of dominant entropy costs:
+  float red_cost_;           // literal, red & blue.
+  float blue_cost_;
  uint8_t is_used_[5];       // 5 for literal, red, blue, alpha, distance
 } VP8LHistogram;

@ -105,21 +105,23 @@ static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
      ((palette_code_bits > 0) ? (1 << palette_code_bits) : 0);
 }

-// Builds the histogram image.
+// Builds the histogram image. pic and percent are for progress.
+// Returns false in case of error (stored in pic->error_code).
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
-                             const VP8LBackwardRefs* const refs,
-                             int quality, int low_effort,
-                             int histogram_bits, int cache_bits,
+                             const VP8LBackwardRefs* const refs, int quality,
+                             int low_effort, int histogram_bits, int cache_bits,
                             VP8LHistogramSet* const image_histo,
                             VP8LHistogram* const tmp_histo,
-                             uint16_t* const histogram_symbols);
+                             uint16_t* const histogram_symbols,
+                             const WebPPicture* const pic, int percent_range,
+                             int* const percent);

 // Returns the entropy for the symbols in the input array.
-double VP8LBitsEntropy(const uint32_t* const array, int n);
+float VP8LBitsEntropy(const uint32_t* const array, int n);

 // Estimate how many bits the combined entropy of literals and distance
 // approximately maps to.
-double VP8LHistogramEstimateBits(VP8LHistogram* const p);
+float VP8LHistogramEstimateBits(VP8LHistogram* const p);

 #ifdef __cplusplus
 }
--- a/src/enc/picture_csp_enc.c
+++ b/src/enc/picture_csp_enc.c
@ -15,12 +15,19 @@
 #include <stdlib.h>
 #include <math.h>

+#include "sharpyuv/sharpyuv.h"
+#include "sharpyuv/sharpyuv_csp.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/utils/random_utils.h"
 #include "src/utils/utils.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/yuv.h"
+#include "src/dsp/cpu.h"
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>
+#endif

 // Uncomment to disable gamma-compression during RGB->U/V averaging
 #define USE_GAMMA_COMPRESSION
@ -76,16 +83,16 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {

 #if defined(USE_GAMMA_COMPRESSION)

-// gamma-compensates loss of resolution during chroma subsampling
-#define kGamma 0.80      // for now we use a different gamma value than kGammaF
-#define kGammaFix 12     // fixed-point precision for linear values
-#define kGammaScale ((1 << kGammaFix) - 1)
-#define kGammaTabFix 7   // fixed-point fractional bits precision
-#define kGammaTabScale (1 << kGammaTabFix)
-#define kGammaTabRounder (kGammaTabScale >> 1)
-#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+// Gamma correction compensates loss of resolution during chroma subsampling.
+#define GAMMA_FIX 12      // fixed-point precision for linear values
+#define GAMMA_TAB_FIX 7   // fixed-point fractional bits precision
+#define GAMMA_TAB_SIZE (1 << (GAMMA_FIX - GAMMA_TAB_FIX))
+static const double kGamma = 0.80;
+static const int kGammaScale = ((1 << GAMMA_FIX) - 1);
+static const int kGammaTabScale = (1 << GAMMA_TAB_FIX);
+static const int kGammaTabRounder = (1 << GAMMA_TAB_FIX >> 1);

-static int kLinearToGammaTab[kGammaTabSize + 1];
+static int kLinearToGammaTab[GAMMA_TAB_SIZE + 1];
 static uint16_t kGammaToLinearTab[256];
 static volatile int kGammaTablesOk = 0;
 static void InitGammaTables(void);
@ -93,13 +100,13 @@ static void InitGammaTables(void);
 WEBP_DSP_INIT_FUNC(InitGammaTables) {
  if (!kGammaTablesOk) {
    int v;
-    const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
+    const double scale = (double)(1 << GAMMA_TAB_FIX) / kGammaScale;
    const double norm = 1. / 255.;
    for (v = 0; v <= 255; ++v) {
      kGammaToLinearTab[v] =
          (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
    }
-    for (v = 0; v <= kGammaTabSize; ++v) {
+    for (v = 0; v <= GAMMA_TAB_SIZE; ++v) {
      kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
    }
    kGammaTablesOk = 1;
@ -111,12 +118,12 @@ static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
 }

 static WEBP_INLINE int Interpolate(int v) {
-  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
+  const int tab_pos = v >> (GAMMA_TAB_FIX + 2);    // integer part
  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
  const int v0 = kLinearToGammaTab[tab_pos];
  const int v1 = kLinearToGammaTab[tab_pos + 1];
  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
-  assert(tab_pos + 1 < kGammaTabSize + 1);
+  assert(tab_pos + 1 < GAMMA_TAB_SIZE + 1);
  return y;
 }

@ -124,7 +131,7 @@ static WEBP_INLINE int Interpolate(int v) {
 // U/V value, suitable for RGBToU/V calls.
 static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
  const int y = Interpolate(base_value << shift);   // final uplifted value
-  return (y + kGammaTabRounder) >> kGammaTabFix;    // descale
+  return (y + kGammaTabRounder) >> GAMMA_TAB_FIX;    // descale
 }

 #else
@ -158,415 +165,41 @@ static int RGBToV(int r, int g, int b, VP8Random* const rg) {
 //------------------------------------------------------------------------------
 // Sharp RGB->YUV conversion

-static const int kNumIterations = 4;
 static const int kMinDimensionIterativeConversion = 4;

-// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
-// banding sometimes. Better use extra precision.
-#define SFIX 2                // fixed-point precision of RGB and Y/W
-typedef int16_t fixed_t;      // signed type with extra SFIX precision for UV
-typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
-
-#define SHALF (1 << SFIX >> 1)
-#define MAX_Y_T ((256 << SFIX) - 1)
-#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
-
-#if defined(USE_GAMMA_COMPRESSION)
-
-// We use tables of different size and precision for the Rec709 / BT2020
-// transfer function.
-#define kGammaF (1./0.45)
-static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
-#define GAMMA_TO_LINEAR_BITS 14
-static uint32_t kGammaToLinearTabS[MAX_Y_T + 1];   // size scales with Y_FIX
-static volatile int kGammaTablesSOk = 0;
-static void InitGammaTablesS(void);
-
-WEBP_DSP_INIT_FUNC(InitGammaTablesS) {
-  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
-  if (!kGammaTablesSOk) {
-    int v;
-    const double norm = 1. / MAX_Y_T;
-    const double scale = 1. / kGammaTabSize;
-    const double a = 0.09929682680944;
-    const double thresh = 0.018053968510807;
-    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
-    for (v = 0; v <= MAX_Y_T; ++v) {
-      const double g = norm * v;
-      double value;
-      if (g <= thresh * 4.5) {
-        value = g / 4.5;
-      } else {
-        const double a_rec = 1. / (1. + a);
-        value = pow(a_rec * (g + a), kGammaF);
-      }
-      kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
-    }
-    for (v = 0; v <= kGammaTabSize; ++v) {
-      const double g = scale * v;
-      double value;
-      if (g <= thresh) {
-        value = 4.5 * g;
-      } else {
-        value = (1. + a) * pow(g, 1. / kGammaF) - a;
-      }
-      // we already incorporate the 1/2 rounding constant here
-      kLinearToGammaTabS[v] =
-          (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
-    }
-    // to prevent small rounding errors to cause read-overflow:
-    kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
-    kGammaTablesSOk = 1;
-  }
-}
-
-// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
-static WEBP_INLINE uint32_t GammaToLinearS(int v) {
-  return kGammaToLinearTabS[v];
-}
-
-static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
-  // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
-  const uint32_t v = value * kGammaTabSize;
-  const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
-  // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
-  const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
-  // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
-  const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
-  const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
-  // Final interpolation. Note that rounding is already included.
-  const uint32_t v2 = (v1 - v0) * x;    // note: v1 >= v0.
-  const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
-  return result;
-}
-
-#else
-
-static void InitGammaTablesS(void) {}
-static WEBP_INLINE uint32_t GammaToLinearS(int v) {
-  return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
-}
-static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
-  return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
-}
-
-#endif    // USE_GAMMA_COMPRESSION
-
-//------------------------------------------------------------------------------
-
-static uint8_t clip_8b(fixed_t v) {
-  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
-}
-
-static fixed_y_t clip_y(int y) {
-  return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
-}
-
-//------------------------------------------------------------------------------
-
-static int RGBToGray(int r, int g, int b) {
-  const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
-  return (luma >> YUV_FIX);
-}
-
-static uint32_t ScaleDown(int a, int b, int c, int d) {
-  const uint32_t A = GammaToLinearS(a);
-  const uint32_t B = GammaToLinearS(b);
-  const uint32_t C = GammaToLinearS(c);
-  const uint32_t D = GammaToLinearS(d);
-  return LinearToGammaS((A + B + C + D + 2) >> 2);
-}
-
-static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
-  int i;
-  for (i = 0; i < w; ++i) {
-    const uint32_t R = GammaToLinearS(src[0 * w + i]);
-    const uint32_t G = GammaToLinearS(src[1 * w + i]);
-    const uint32_t B = GammaToLinearS(src[2 * w + i]);
-    const uint32_t Y = RGBToGray(R, G, B);
-    dst[i] = (fixed_y_t)LinearToGammaS(Y);
-  }
-}
-
-static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
-                         fixed_t* dst, int uv_w) {
-  int i;
-  for (i = 0; i < uv_w; ++i) {
-    const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
-                            src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
-    const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
-                            src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
-    const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
-                            src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
-    const int W = RGBToGray(r, g, b);
-    dst[0 * uv_w] = (fixed_t)(r - W);
-    dst[1 * uv_w] = (fixed_t)(g - W);
-    dst[2 * uv_w] = (fixed_t)(b - W);
-    dst  += 1;
-    src1 += 2;
-    src2 += 2;
-  }
-}
-
-static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
-  int i;
-  for (i = 0; i < w; ++i) {
-    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
-  }
-}
-
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
-  const int v0 = (A * 3 + B + 2) >> 2;
-  return clip_y(v0 + W0);
-}
-
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {  // 8bit -> SFIX
-  return ((fixed_y_t)a << SFIX) | SHALF;
-}
-
-static void ImportOneRow(const uint8_t* const r_ptr,
-                         const uint8_t* const g_ptr,
-                         const uint8_t* const b_ptr,
-                         int step,
-                         int pic_width,
-                         fixed_y_t* const dst) {
-  int i;
-  const int w = (pic_width + 1) & ~1;
-  for (i = 0; i < pic_width; ++i) {
-    const int off = i * step;
-    dst[i + 0 * w] = UpLift(r_ptr[off]);
-    dst[i + 1 * w] = UpLift(g_ptr[off]);
-    dst[i + 2 * w] = UpLift(b_ptr[off]);
-  }
-  if (pic_width & 1) {  // replicate rightmost pixel
-    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
-    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
-    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
-  }
-}
-
-static void InterpolateTwoRows(const fixed_y_t* const best_y,
-                               const fixed_t* prev_uv,
-                               const fixed_t* cur_uv,
-                               const fixed_t* next_uv,
-                               int w,
-                               fixed_y_t* out1,
-                               fixed_y_t* out2) {
-  const int uv_w = w >> 1;
-  const int len = (w - 1) >> 1;   // length to filter
-  int k = 3;
-  while (k-- > 0) {   // process each R/G/B segments in turn
-    // special boundary case for i==0
-    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
-    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
-
-    WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
-    WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
-
-    // special boundary case for i == w - 1 when w is even
-    if (!(w & 1)) {
-      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
-                            best_y[w - 1 + 0]);
-      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
-                            best_y[w - 1 + w]);
-    }
-    out1 += w;
-    out2 += w;
-    prev_uv += uv_w;
-    cur_uv  += uv_w;
-    next_uv += uv_w;
-  }
-}
-
-static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
-  const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
-  return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
-}
-
-static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
-  const int u =  -9719 * r - 19081 * g + 28800 * b + SROUNDER;
-  return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
-}
-
-static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
-  const int v = +28800 * r - 24116 * g -  4684 * b + SROUNDER;
-  return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
-}
-
-static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
-                            WebPPicture* const picture) {
-  int i, j;
-  uint8_t* dst_y = picture->y;
-  uint8_t* dst_u = picture->u;
-  uint8_t* dst_v = picture->v;
-  const fixed_t* const best_uv_base = best_uv;
-  const int w = (picture->width + 1) & ~1;
-  const int h = (picture->height + 1) & ~1;
-  const int uv_w = w >> 1;
-  const int uv_h = h >> 1;
-  for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
-    for (i = 0; i < picture->width; ++i) {
-      const int off = (i >> 1);
-      const int W = best_y[i];
-      const int r = best_uv[off + 0 * uv_w] + W;
-      const int g = best_uv[off + 1 * uv_w] + W;
-      const int b = best_uv[off + 2 * uv_w] + W;
-      dst_y[i] = ConvertRGBToY(r, g, b);
-    }
-    best_y += w;
-    best_uv += (j & 1) * 3 * uv_w;
-    dst_y += picture->y_stride;
-  }
-  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
-    for (i = 0; i < uv_w; ++i) {
-      const int off = i;
-      const int r = best_uv[off + 0 * uv_w];
-      const int g = best_uv[off + 1 * uv_w];
-      const int b = best_uv[off + 2 * uv_w];
-      dst_u[i] = ConvertRGBToU(r, g, b);
-      dst_v[i] = ConvertRGBToV(r, g, b);
-    }
-    best_uv += 3 * uv_w;
-    dst_u += picture->uv_stride;
-    dst_v += picture->uv_stride;
-  }
-  return 1;
-}
-
 //------------------------------------------------------------------------------
 // Main function

-#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
+extern void SharpYuvInit(VP8CPUInfo cpu_info_func);
+
+static void SafeInitSharpYuv(void) {
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+  static pthread_mutex_t initsharpyuv_lock = PTHREAD_MUTEX_INITIALIZER;
+  if (pthread_mutex_lock(&initsharpyuv_lock)) return;
+#endif
+
+  SharpYuvInit(VP8GetCPUInfo);
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+  (void)pthread_mutex_unlock(&initsharpyuv_lock);
+#endif
+}

 static int PreprocessARGB(const uint8_t* r_ptr,
                          const uint8_t* g_ptr,
                          const uint8_t* b_ptr,
                          int step, int rgb_stride,
                          WebPPicture* const picture) {
-  // we expand the right/bottom border if needed
-  const int w = (picture->width + 1) & ~1;
-  const int h = (picture->height + 1) & ~1;
-  const int uv_w = w >> 1;
-  const int uv_h = h >> 1;
-  uint64_t prev_diff_y_sum = ~0;
-  int j, iter;
-
-  // TODO(skal): allocate one big memory chunk. But for now, it's easier
-  // for valgrind debugging to have several chunks.
-  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
-  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
-  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
-  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
-  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
-  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
-  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
-  fixed_y_t* best_y = best_y_base;
-  fixed_y_t* target_y = target_y_base;
-  fixed_t* best_uv = best_uv_base;
-  fixed_t* target_uv = target_uv_base;
-  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
-  int ok;
-
-  if (best_y_base == NULL || best_uv_base == NULL ||
-      target_y_base == NULL || target_uv_base == NULL ||
-      best_rgb_y == NULL || best_rgb_uv == NULL ||
-      tmp_buffer == NULL) {
-    ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
-    goto End;
+  const int ok = SharpYuvConvert(
+      r_ptr, g_ptr, b_ptr, step, rgb_stride, /*rgb_bit_depth=*/8,
+      picture->y, picture->y_stride, picture->u, picture->uv_stride, picture->v,
+      picture->uv_stride, /*yuv_bit_depth=*/8, picture->width,
+      picture->height, SharpYuvGetConversionMatrix(kSharpYuvMatrixWebp));
+  if (!ok) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
  }
-  assert(picture->width >= kMinDimensionIterativeConversion);
-  assert(picture->height >= kMinDimensionIterativeConversion);
-
-  WebPInitConvertARGBToYUV();
-
-  // Import RGB samples to W/RGB representation.
-  for (j = 0; j < picture->height; j += 2) {
-    const int is_last_row = (j == picture->height - 1);
-    fixed_y_t* const src1 = tmp_buffer + 0 * w;
-    fixed_y_t* const src2 = tmp_buffer + 3 * w;
-
-    // prepare two rows of input
-    ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
-    if (!is_last_row) {
-      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
-                   step, picture->width, src2);
-    } else {
-      memcpy(src2, src1, 3 * w * sizeof(*src2));
-    }
-    StoreGray(src1, best_y + 0, w);
-    StoreGray(src2, best_y + w, w);
-
-    UpdateW(src1, target_y, w);
-    UpdateW(src2, target_y + w, w);
-    UpdateChroma(src1, src2, target_uv, uv_w);
-    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
-    best_y += 2 * w;
-    best_uv += 3 * uv_w;
-    target_y += 2 * w;
-    target_uv += 3 * uv_w;
-    r_ptr += 2 * rgb_stride;
-    g_ptr += 2 * rgb_stride;
-    b_ptr += 2 * rgb_stride;
-  }
-
-  // Iterate and resolve clipping conflicts.
-  for (iter = 0; iter < kNumIterations; ++iter) {
-    const fixed_t* cur_uv = best_uv_base;
-    const fixed_t* prev_uv = best_uv_base;
-    uint64_t diff_y_sum = 0;
-
-    best_y = best_y_base;
-    best_uv = best_uv_base;
-    target_y = target_y_base;
-    target_uv = target_uv_base;
-    for (j = 0; j < h; j += 2) {
-      fixed_y_t* const src1 = tmp_buffer + 0 * w;
-      fixed_y_t* const src2 = tmp_buffer + 3 * w;
-      {
-        const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
-        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
-        prev_uv = cur_uv;
-        cur_uv = next_uv;
-      }
-
-      UpdateW(src1, best_rgb_y + 0 * w, w);
-      UpdateW(src2, best_rgb_y + 1 * w, w);
-      UpdateChroma(src1, src2, best_rgb_uv, uv_w);
-
-      // update two rows of Y and one row of RGB
-      diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
-      WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
-
-      best_y += 2 * w;
-      best_uv += 3 * uv_w;
-      target_y += 2 * w;
-      target_uv += 3 * uv_w;
-    }
-    // test exit condition
-    if (iter > 0) {
-      if (diff_y_sum < diff_y_threshold) break;
-      if (diff_y_sum > prev_diff_y_sum) break;
-    }
-    prev_diff_y_sum = diff_y_sum;
-  }
-  // final reconstruction
-  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
-
- End:
-  WebPSafeFree(best_y_base);
-  WebPSafeFree(best_uv_base);
-  WebPSafeFree(target_y_base);
-  WebPSafeFree(target_uv_base);
-  WebPSafeFree(best_rgb_y);
-  WebPSafeFree(best_rgb_uv);
-  WebPSafeFree(tmp_buffer);
  return ok;
 }
-#undef SAFE_ALLOC

 //------------------------------------------------------------------------------
 // "Fast" regular RGB->YUV
@ -591,8 +224,8 @@ static const int kAlphaFix = 19;
 // and constant are adjusted very tightly to fit 32b arithmetic.
 // In particular, they use the fact that the operands for 'v / a' are actually
 // derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
-// with ai in [0..255] and pi in [0..1<<kGammaFix). The constraint to avoid
-// overflow is: kGammaFix + kAlphaFix <= 31.
+// with ai in [0..255] and pi in [0..1<<GAMMA_FIX). The constraint to avoid
+// overflow is: GAMMA_FIX + kAlphaFix <= 31.
 static const uint32_t kInvAlpha[4 * 0xff + 1] = {
  0,  /* alpha = 0 */
  524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536,
@ -818,11 +451,20 @@ static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr,
    dst[0] = SUM4(r_ptr + j, step);
    dst[1] = SUM4(g_ptr + j, step);
    dst[2] = SUM4(b_ptr + j, step);
+    // MemorySanitizer may raise false positives with data that passes through
+    // RGBA32PackedToPlanar_16b_SSE41() due to incorrect modeling of shuffles.
+    // See https://crbug.com/webp/573.
+#ifdef WEBP_MSAN
+    dst[3] = 0;
+#endif
  }
  if (width & 1) {
    dst[0] = SUM2(r_ptr + j);
    dst[1] = SUM2(g_ptr + j);
    dst[2] = SUM2(b_ptr + j);
+#ifdef WEBP_MSAN
+    dst[3] = 0;
+#endif
  }
 }

@ -863,18 +505,18 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
    use_iterative_conversion = 0;
  }

-  if (!WebPPictureAllocYUVA(picture, width, height)) {
+  if (!WebPPictureAllocYUVA(picture)) {
    return 0;
  }
  if (has_alpha) {
    assert(step == 4);
 #if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
-    assert(kAlphaFix + kGammaFix <= 31);
+    assert(kAlphaFix + GAMMA_FIX <= 31);
 #endif
  }

  if (use_iterative_conversion) {
-    InitGammaTablesS();
+    SafeInitSharpYuv();
    if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
      return 0;
    }
@ -1044,7 +686,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
  }
  // Allocate a new argb buffer (discarding the previous one).
-  if (!WebPPictureAllocARGB(picture, picture->width, picture->height)) return 0;
+  if (!WebPPictureAllocARGB(picture)) return 0;
  picture->use_argb = 1;

  // Convert
@ -1106,6 +748,8 @@ static int Import(WebPPicture* const picture,
  const int width = picture->width;
  const int height = picture->height;

+  if (abs(rgb_stride) < (import_alpha ? 4 : 3) * width) return 0;
+
  if (!picture->use_argb) {
    const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
    return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
@ -1163,24 +807,24 @@ static int Import(WebPPicture* const picture,
 #if !defined(WEBP_REDUCE_CSP)

 int WebPPictureImportBGR(WebPPicture* picture,
-                         const uint8_t* rgb, int rgb_stride) {
-  return (picture != NULL && rgb != NULL)
-             ? Import(picture, rgb, rgb_stride, 3, 1, 0)
+                         const uint8_t* bgr, int bgr_stride) {
+  return (picture != NULL && bgr != NULL)
+             ? Import(picture, bgr, bgr_stride, 3, 1, 0)
             : 0;
 }

 int WebPPictureImportBGRA(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 1)
+                          const uint8_t* bgra, int bgra_stride) {
+  return (picture != NULL && bgra != NULL)
+             ? Import(picture, bgra, bgra_stride, 4, 1, 1)
             : 0;
 }


 int WebPPictureImportBGRX(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 0)
+                          const uint8_t* bgrx, int bgrx_stride) {
+  return (picture != NULL && bgrx != NULL)
+             ? Import(picture, bgrx, bgrx_stride, 4, 1, 0)
             : 0;
 }

@ -1201,9 +845,9 @@ int WebPPictureImportRGBA(WebPPicture* picture,
 }

 int WebPPictureImportRGBX(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 0, 0)
+                          const uint8_t* rgbx, int rgbx_stride) {
+  return (picture != NULL && rgbx != NULL)
+             ? Import(picture, rgbx, rgbx_stride, 4, 0, 0)
             : 0;
 }

--- a/src/enc/picture_enc.c
+++ b/src/enc/picture_enc.c
@ -45,6 +45,22 @@ int WebPPictureInitInternal(WebPPicture* picture, int version) {

 //------------------------------------------------------------------------------

+int WebPValidatePicture(const WebPPicture* const picture) {
+  if (picture == NULL) return 0;
+  if (picture->width <= 0 || picture->height <= 0) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+  if (picture->width <= 0 || picture->width / 4 > INT_MAX / 4 ||
+      picture->height <= 0 || picture->height / 4 > INT_MAX / 4) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+  if (picture->colorspace != WEBP_YUV420 &&
+      picture->colorspace != WEBP_YUV420A) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  }
+  return 1;
+}
+
 static void WebPPictureResetBufferARGB(WebPPicture* const picture) {
  picture->memory_argb_ = NULL;
  picture->argb = NULL;
@ -63,18 +79,17 @@ void WebPPictureResetBuffers(WebPPicture* const picture) {
  WebPPictureResetBufferYUVA(picture);
 }

-int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
+int WebPPictureAllocARGB(WebPPicture* const picture) {
  void* memory;
+  const int width = picture->width;
+  const int height = picture->height;
  const uint64_t argb_size = (uint64_t)width * height;

-  assert(picture != NULL);
+  if (!WebPValidatePicture(picture)) return 0;

  WebPSafeFree(picture->memory_argb_);
  WebPPictureResetBufferARGB(picture);

-  if (width <= 0 || height <= 0) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
-  }
  // allocate a new buffer.
  memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb));
  if (memory == NULL) {
@ -86,10 +101,10 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
  return 1;
 }

-int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
-  const WebPEncCSP uv_csp =
-      (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
+int WebPPictureAllocYUVA(WebPPicture* const picture) {
  const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
+  const int width = picture->width;
+  const int height = picture->height;
  const int y_stride = width;
  const int uv_width = (int)(((int64_t)width + 1) >> 1);
  const int uv_height = (int)(((int64_t)height + 1) >> 1);
@ -98,15 +113,11 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
  uint64_t y_size, uv_size, a_size, total_size;
  uint8_t* mem;

-  assert(picture != NULL);
+  if (!WebPValidatePicture(picture)) return 0;

  WebPSafeFree(picture->memory_);
  WebPPictureResetBufferYUVA(picture);

-  if (uv_csp != WEBP_YUV420) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
-  }
-
  // alpha
  a_width = has_alpha ? width : 0;
  a_stride = a_width;
@ -152,15 +163,12 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {

 int WebPPictureAlloc(WebPPicture* picture) {
  if (picture != NULL) {
-    const int width = picture->width;
-    const int height = picture->height;
-
    WebPPictureFree(picture);   // erase previous buffer

    if (!picture->use_argb) {
-      return WebPPictureAllocYUVA(picture, width, height);
+      return WebPPictureAllocYUVA(picture);
    } else {
-      return WebPPictureAllocARGB(picture, width, height);
+      return WebPPictureAllocARGB(picture);
    }
  }
  return 1;
--- a/src/enc/picture_rescale_enc.c
+++ b/src/enc/picture_rescale_enc.c
@ -13,14 +13,15 @@

 #include "src/webp/encode.h"

-#if !defined(WEBP_REDUCE_SIZE)
-
 #include <assert.h>
 #include <stdlib.h>

 #include "src/enc/vp8i_enc.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
 #include "src/utils/rescaler_utils.h"
 #include "src/utils/utils.h"
+#endif  // !defined(WEBP_REDUCE_SIZE)

 #define HALVE(x) (((x) + 1) >> 1)

@ -56,6 +57,7 @@ static int AdjustAndCheckRectangle(const WebPPicture* const pic,
  return 1;
 }

+#if !defined(WEBP_REDUCE_SIZE)
 int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
  if (src == NULL || dst == NULL) return 0;
  if (src == dst) return 1;
@ -81,6 +83,7 @@ int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
  }
  return 1;
 }
+#endif  // !defined(WEBP_REDUCE_SIZE)

 int WebPPictureIsView(const WebPPicture* picture) {
  if (picture == NULL) return 0;
@ -120,6 +123,7 @@ int WebPPictureView(const WebPPicture* src,
  return 1;
 }

+#if !defined(WEBP_REDUCE_SIZE)
 //------------------------------------------------------------------------------
 // Picture cropping

@ -198,34 +202,34 @@ static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
  }
 }

-int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+int WebPPictureRescale(WebPPicture* picture, int width, int height) {
  WebPPicture tmp;
  int prev_width, prev_height;
  rescaler_t* work;

-  if (pic == NULL) return 0;
-  prev_width = pic->width;
-  prev_height = pic->height;
+  if (picture == NULL) return 0;
+  prev_width = picture->width;
+  prev_height = picture->height;
  if (!WebPRescalerGetScaledDimensions(
          prev_width, prev_height, &width, &height)) {
    return 0;
  }

-  PictureGrabSpecs(pic, &tmp);
+  PictureGrabSpecs(picture, &tmp);
  tmp.width = width;
  tmp.height = height;
  if (!WebPPictureAlloc(&tmp)) return 0;

-  if (!pic->use_argb) {
+  if (!picture->use_argb) {
    work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
    if (work == NULL) {
      WebPPictureFree(&tmp);
      return 0;
    }
    // If present, we need to rescale alpha first (for AlphaMultiplyY).
-    if (pic->a != NULL) {
+    if (picture->a != NULL) {
      WebPInitAlphaProcessing();
-      if (!RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+      if (!RescalePlane(picture->a, prev_width, prev_height, picture->a_stride,
                        tmp.a, width, height, tmp.a_stride, work, 1)) {
        return 0;
      }
@ -233,17 +237,15 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {

    // We take transparency into account on the luma plane only. That's not
    // totally exact blending, but still is a good approximation.
-    AlphaMultiplyY(pic, 0);
-    if (!RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
+    AlphaMultiplyY(picture, 0);
+    if (!RescalePlane(picture->y, prev_width, prev_height, picture->y_stride,
                      tmp.y, width, height, tmp.y_stride, work, 1) ||
-        !RescalePlane(pic->u,
-                      HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
-                      tmp.u,
-                      HALVE(width), HALVE(height), tmp.uv_stride, work, 1) ||
-        !RescalePlane(pic->v,
-                      HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
-                      tmp.v,
-                      HALVE(width), HALVE(height), tmp.uv_stride, work, 1)) {
+        !RescalePlane(picture->u, HALVE(prev_width), HALVE(prev_height),
+                      picture->uv_stride, tmp.u, HALVE(width), HALVE(height),
+                      tmp.uv_stride, work, 1) ||
+        !RescalePlane(picture->v, HALVE(prev_width), HALVE(prev_height),
+                      picture->uv_stride, tmp.v, HALVE(width), HALVE(height),
+                      tmp.uv_stride, work, 1)) {
      return 0;
    }
    AlphaMultiplyY(&tmp, 1);
@ -257,18 +259,17 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
    // weighting first (black-matting), scale the RGB values, and remove
    // the premultiplication afterward (while preserving the alpha channel).
    WebPInitAlphaProcessing();
-    AlphaMultiplyARGB(pic, 0);
-    if (!RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
-                      pic->argb_stride * 4,
-                      (uint8_t*)tmp.argb, width, height,
-                      tmp.argb_stride * 4, work, 4)) {
+    AlphaMultiplyARGB(picture, 0);
+    if (!RescalePlane((const uint8_t*)picture->argb, prev_width, prev_height,
+                      picture->argb_stride * 4, (uint8_t*)tmp.argb, width,
+                      height, tmp.argb_stride * 4, work, 4)) {
      return 0;
    }
    AlphaMultiplyARGB(&tmp, 1);
  }
-  WebPPictureFree(pic);
+  WebPPictureFree(picture);
  WebPSafeFree(work);
-  *pic = tmp;
+  *picture = tmp;
  return 1;
 }

@ -280,23 +281,6 @@ int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
  return 0;
 }

-int WebPPictureIsView(const WebPPicture* picture) {
-  (void)picture;
-  return 0;
-}
-
-int WebPPictureView(const WebPPicture* src,
-                    int left, int top, int width, int height,
-                    WebPPicture* dst) {
-  (void)src;
-  (void)left;
-  (void)top;
-  (void)width;
-  (void)height;
-  (void)dst;
-  return 0;
-}
-
 int WebPPictureCrop(WebPPicture* pic,
                    int left, int top, int width, int height) {
  (void)pic;
--- a/src/enc/picture_tools_enc.c
+++ b/src/enc/picture_tools_enc.c
@ -190,27 +190,28 @@ static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
  return (0xff000000u | (r << 16) | (g << 8) | b);
 }

-void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+void WebPBlendAlpha(WebPPicture* picture, uint32_t background_rgb) {
  const int red = (background_rgb >> 16) & 0xff;
  const int green = (background_rgb >> 8) & 0xff;
  const int blue = (background_rgb >> 0) & 0xff;
  int x, y;
-  if (pic == NULL) return;
-  if (!pic->use_argb) {
-    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
+  if (picture == NULL) return;
+  if (!picture->use_argb) {
+    // omit last pixel during u/v loop
+    const int uv_width = (picture->width >> 1);
    const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
    // VP8RGBToU/V expects the u/v values summed over four pixels
    const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
    const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
-    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
-    uint8_t* y_ptr = pic->y;
-    uint8_t* u_ptr = pic->u;
-    uint8_t* v_ptr = pic->v;
-    uint8_t* a_ptr = pic->a;
+    const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
+    uint8_t* y_ptr = picture->y;
+    uint8_t* u_ptr = picture->u;
+    uint8_t* v_ptr = picture->v;
+    uint8_t* a_ptr = picture->a;
    if (!has_alpha || a_ptr == NULL) return;    // nothing to do
-    for (y = 0; y < pic->height; ++y) {
+    for (y = 0; y < picture->height; ++y) {
      // Luma blending
-      for (x = 0; x < pic->width; ++x) {
+      for (x = 0; x < picture->width; ++x) {
        const uint8_t alpha = a_ptr[x];
        if (alpha < 0xff) {
          y_ptr[x] = BLEND(Y0, y_ptr[x], alpha);
@ -219,7 +220,7 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
      // Chroma blending every even line
      if ((y & 1) == 0) {
        uint8_t* const a_ptr2 =
-            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+            (y + 1 == picture->height) ? a_ptr : a_ptr + picture->a_stride;
        for (x = 0; x < uv_width; ++x) {
          // Average four alpha values into a single blending weight.
          // TODO(skal): might lead to visible contouring. Can we do better?
@ -229,24 +230,24 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
          u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
          v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
        }
-        if (pic->width & 1) {   // rightmost pixel
+        if (picture->width & 1) {  // rightmost pixel
          const uint32_t alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
          u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
          v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
        }
      } else {
-        u_ptr += pic->uv_stride;
-        v_ptr += pic->uv_stride;
+        u_ptr += picture->uv_stride;
+        v_ptr += picture->uv_stride;
      }
-      memset(a_ptr, 0xff, pic->width);  // reset alpha value to opaque
-      a_ptr += pic->a_stride;
-      y_ptr += pic->y_stride;
+      memset(a_ptr, 0xff, picture->width);  // reset alpha value to opaque
+      a_ptr += picture->a_stride;
+      y_ptr += picture->y_stride;
    }
  } else {
-    uint32_t* argb = pic->argb;
+    uint32_t* argb = picture->argb;
    const uint32_t background = MakeARGB32(red, green, blue);
-    for (y = 0; y < pic->height; ++y) {
-      for (x = 0; x < pic->width; ++x) {
+    for (y = 0; y < picture->height; ++y) {
+      for (x = 0; x < picture->width; ++x) {
        const int alpha = (argb[x] >> 24) & 0xff;
        if (alpha != 0xff) {
          if (alpha > 0) {
@ -262,7 +263,7 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
          }
        }
      }
-      argb += pic->argb_stride;
+      argb += picture->argb_stride;
    }
  }
 }
--- a/src/enc/predictor_enc.c
+++ b/src/enc/predictor_enc.c
@ -16,6 +16,7 @@

 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
+#include "src/enc/vp8i_enc.h"
 #include "src/enc/vp8li_enc.h"

 #define MAX_DIFF_COST (1e30f)
@ -31,10 +32,10 @@ static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
 // Methods to calculate Entropy (Shannon).

 static float PredictionCostSpatial(const int counts[256], int weight_0,
-                                   double exp_val) {
+                                   float exp_val) {
  const int significant_symbols = 256 >> 4;
-  const double exp_decay_factor = 0.6;
-  double bits = weight_0 * counts[0];
+  const float exp_decay_factor = 0.6f;
+  float bits = (float)weight_0 * counts[0];
  int i;
  for (i = 1; i < significant_symbols; ++i) {
    bits += exp_val * (counts[i] + counts[256 - i]);
@ -46,9 +47,9 @@ static float PredictionCostSpatial(const int counts[256], int weight_0,
 static float PredictionCostSpatialHistogram(const int accumulated[4][256],
                                            const int tile[4][256]) {
  int i;
-  double retval = 0;
+  float retval = 0.f;
  for (i = 0; i < 4; ++i) {
-    const double kExpValue = 0.94;
+    const float kExpValue = 0.94f;
    retval += PredictionCostSpatial(tile[i], 1, kExpValue);
    retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
  }
@ -472,12 +473,15 @@ static void CopyImageWithPrediction(int width, int height,
 // with respect to predictions. If near_lossless_quality < 100, applies
 // near lossless processing, shaving off more bits of residuals for lower
 // qualities.
-void VP8LResidualImage(int width, int height, int bits, int low_effort,
+int VP8LResidualImage(int width, int height, int bits, int low_effort,
                      uint32_t* const argb, uint32_t* const argb_scratch,
                      uint32_t* const image, int near_lossless_quality,
-                       int exact, int used_subtract_green) {
+                      int exact, int used_subtract_green,
+                      const WebPPicture* const pic, int percent_range,
+                      int* const percent) {
  const int tiles_per_row = VP8LSubSampleSize(width, bits);
  const int tiles_per_col = VP8LSubSampleSize(height, bits);
+  int percent_start = *percent;
  int tile_y;
  int histo[4][256];
  const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
@ -491,17 +495,24 @@ void VP8LResidualImage(int width, int height, int bits, int low_effort,
    for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
      int tile_x;
      for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
-        const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
-            bits, histo, argb_scratch, argb, max_quantization, exact,
-            used_subtract_green, image);
+        const int pred = GetBestPredictorForTile(
+            width, height, tile_x, tile_y, bits, histo, argb_scratch, argb,
+            max_quantization, exact, used_subtract_green, image);
        image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
      }
+
+      if (!WebPReportProgress(
+              pic, percent_start + percent_range * tile_y / tiles_per_col,
+              percent)) {
+        return 0;
+      }
    }
  }

  CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
                          low_effort, max_quantization, exact,
                          used_subtract_green);
+  return WebPReportProgress(pic, percent_start + percent_range, percent);
 }

 //------------------------------------------------------------------------------
@ -532,7 +543,7 @@ static float PredictionCostCrossColor(const int accumulated[256],
                                      const int counts[256]) {
  // Favor low entropy, locally and globally.
  // Favor small absolute values for PredictionCostSpatial
-  static const double kExpValue = 2.4;
+  static const float kExpValue = 2.4f;
  return VP8LCombinedShannonEntropy(counts, accumulated) +
         PredictionCostSpatial(counts, 3, kExpValue);
 }
@ -714,11 +725,14 @@ static void CopyTileWithColorTransform(int xsize, int ysize,
  }
 }

-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
-                             uint32_t* const argb, uint32_t* image) {
+int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                            uint32_t* const argb, uint32_t* image,
+                            const WebPPicture* const pic, int percent_range,
+                            int* const percent) {
  const int max_tile_size = 1 << bits;
  const int tile_xsize = VP8LSubSampleSize(width, bits);
  const int tile_ysize = VP8LSubSampleSize(height, bits);
+  int percent_start = *percent;
  int accumulated_red_histo[256] = { 0 };
  int accumulated_blue_histo[256] = { 0 };
  int tile_x, tile_y;
@ -768,5 +782,11 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
        }
      }
    }
+    if (!WebPReportProgress(
+            pic, percent_start + percent_range * tile_y / tile_ysize,
+            percent)) {
+      return 0;
    }
  }
+  return 1;
+}
--- a/src/enc/quant_enc.c
+++ b/src/enc/quant_enc.c
@ -533,7 +533,8 @@ static void InitScore(VP8ModeScore* const rd) {
  rd->score = MAX_COST;
 }

-static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
+                      const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
@ -542,7 +543,8 @@ static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->score = src->score;
 }

-static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
+                     const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
@ -588,10 +590,10 @@ static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
 // Coefficient type.
 enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };

-static int TrellisQuantizeBlock(const VP8Encoder* const enc,
+static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
-                                const VP8Matrix* const mtx,
+                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
@ -767,9 +769,9 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
 // all at once. Output is the reconstructed block in *yuv_out, and the
 // quantized levels in *levels.

-static int ReconstructIntra16(VP8EncIterator* const it,
-                              VP8ModeScore* const rd,
-                              uint8_t* const yuv_out,
+static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
+                              VP8ModeScore* WEBP_RESTRICT const rd,
+                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
@ -819,10 +821,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
  return nz;
 }

-static int ReconstructIntra4(VP8EncIterator* const it,
+static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
-                             const uint8_t* const src,
-                             uint8_t* const yuv_out,
+                             const uint8_t* WEBP_RESTRICT const src,
+                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
@ -855,7 +857,8 @@ static int ReconstructIntra4(VP8EncIterator* const it,

 // Quantize as usual, but also compute and return the quantization error.
 // Error is already divided by DSHIFT.
-static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
+                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
@ -869,9 +872,10 @@ static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
  return (sign ? -V : V) >> DSCALE;
 }

-static void CorrectDCValues(const VP8EncIterator* const it,
-                            const VP8Matrix* const mtx,
-                            int16_t tmp[][16], VP8ModeScore* const rd) {
+static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
+                            const VP8Matrix* WEBP_RESTRICT const mtx,
+                            int16_t tmp[][16],
+                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
@ -902,8 +906,8 @@ static void CorrectDCValues(const VP8EncIterator* const it,
  }
 }

-static void StoreDiffusionErrors(VP8EncIterator* const it,
-                                 const VP8ModeScore* const rd) {
+static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
+                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
@ -922,8 +926,9 @@ static void StoreDiffusionErrors(VP8EncIterator* const it,

 //------------------------------------------------------------------------------

-static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
-                         uint8_t* const yuv_out, int mode) {
+static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
+                         VP8ModeScore* WEBP_RESTRICT const rd,
+                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
@ -994,7 +999,8 @@ static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 }

-static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
+static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
+                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
@ -1054,7 +1060,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
 //------------------------------------------------------------------------------

 // return the cost array corresponding to the surrounding prediction modes.
-static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
+static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
@ -1063,7 +1069,8 @@ static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
  return VP8FixedCostsI4[top][left];
 }

-static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
@ -1159,7 +1166,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {

 //------------------------------------------------------------------------------

-static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
+                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
@ -1211,7 +1219,8 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------
 // Final reconstruction and quantization.

-static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
+                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;
@ -1236,9 +1245,9 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
 }

 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
-static void RefineUsingDistortion(VP8EncIterator* const it,
+static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
-                                  VP8ModeScore* const rd) {
+                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
@ -1352,7 +1361,8 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
 //------------------------------------------------------------------------------
 // Entry point

-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
+                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;
--- a/src/enc/vp8i_enc.h
+++ b/src/enc/vp8i_enc.h
@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define ENC_MAJ_VERSION 1
 #define ENC_MIN_VERSION 2
-#define ENC_REV_VERSION 2
+#define ENC_REV_VERSION 3

 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
       MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
@ -470,7 +470,8 @@ int VP8EncAnalyze(VP8Encoder* const enc);
 // Sets up segment's quantization values, base_quant_ and filter strengths.
 void VP8SetSegmentParams(VP8Encoder* const enc, float quality);
 // Pick best modes and fills the levels. Returns true if skipped.
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
+                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt);

  // in alpha.c
@ -490,19 +491,24 @@ int VP8FilterStrengthFromDelta(int sharpness, int delta);

  // misc utils for picture_*.c:

+// Returns true if 'picture' is non-NULL and dimensions/colorspace are within
+// their valid ranges. If returning false, the 'error_code' in 'picture' is
+// updated.
+int WebPValidatePicture(const WebPPicture* const picture);
+
 // Remove reference to the ARGB/YUVA buffer (doesn't free anything).
 void WebPPictureResetBuffers(WebPPicture* const picture);

-// Allocates ARGB buffer of given dimension (previous one is always free'd).
-// Preserves the YUV(A) buffer. Returns false in case of error (invalid param,
-// out-of-memory).
-int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
+// Allocates ARGB buffer according to set width/height (previous one is
+// always free'd). Preserves the YUV(A) buffer. Returns false in case of error
+// (invalid param, out-of-memory).
+int WebPPictureAllocARGB(WebPPicture* const picture);

-// Allocates YUVA buffer of given dimension (previous one is always free'd).
-// Uses picture->csp to determine whether an alpha buffer is needed.
+// Allocates YUVA buffer according to set width/height (previous one is always
+// free'd). Uses picture->csp to determine whether an alpha buffer is needed.
 // Preserves the ARGB buffer.
 // Returns false in case of error (invalid param, out-of-memory).
-int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
+int WebPPictureAllocYUVA(WebPPicture* const picture);

 // Replace samples that are fully transparent by 'color' to help compressibility
 // (no guarantee, though). Assumes pic->use_argb is true.
--- a/src/enc/vp8l_enc.c
+++ b/src/enc/vp8l_enc.c
--- a/src/enc/vp8li_enc.h
+++ b/src/enc/vp8li_enc.h
@ -89,9 +89,10 @@ int VP8LEncodeImage(const WebPConfig* const config,

 // Encodes the main image stream using the supplied bit writer.
 // If 'use_cache' is false, disables the use of color cache.
-WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
-                                   const WebPPicture* const picture,
-                                   VP8LBitWriter* const bw, int use_cache);
+// Returns false in case of error (stored in picture->error_code).
+int VP8LEncodeStream(const WebPConfig* const config,
+                     const WebPPicture* const picture, VP8LBitWriter* const bw,
+                     int use_cache);

 #if (WEBP_NEAR_LOSSLESS == 1)
 // in near_lossless.c
@ -103,13 +104,18 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
 //------------------------------------------------------------------------------
 // Image transforms in predictor.c.

-void VP8LResidualImage(int width, int height, int bits, int low_effort,
+// pic and percent are for progress.
+// Returns false in case of error (stored in pic->error_code).
+int VP8LResidualImage(int width, int height, int bits, int low_effort,
                      uint32_t* const argb, uint32_t* const argb_scratch,
                      uint32_t* const image, int near_lossless, int exact,
-                       int used_subtract_green);
+                      int used_subtract_green, const WebPPicture* const pic,
+                      int percent_range, int* const percent);

-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
-                             uint32_t* const argb, uint32_t* image);
+int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                            uint32_t* const argb, uint32_t* image,
+                            const WebPPicture* const pic, int percent_range,
+                            int* const percent);

 //------------------------------------------------------------------------------

--- a/src/enc/webp_enc.c
+++ b/src/enc/webp_enc.c
@ -336,9 +336,7 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
  if (!WebPValidateConfig(config)) {
    return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
  }
-  if (pic->width <= 0 || pic->height <= 0) {
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
-  }
+  if (!WebPValidatePicture(pic)) return 0;
  if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
  }
--- a/src/libwebp.rc
+++ b/src/libwebp.rc
@ -6,8 +6,8 @@
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 1,0,2,2
- PRODUCTVERSION 1,0,2,2
+ FILEVERSION 1,0,2,3
+ PRODUCTVERSION 1,0,2,3
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@ -24,12 +24,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "Google, Inc."
            VALUE "FileDescription", "libwebp DLL"
-            VALUE "FileVersion", "1.2.2"
+            VALUE "FileVersion", "1.2.3"
            VALUE "InternalName", "libwebp.dll"
-            VALUE "LegalCopyright", "Copyright (C) 2021"
+            VALUE "LegalCopyright", "Copyright (C) 2022"
            VALUE "OriginalFilename", "libwebp.dll"
            VALUE "ProductName", "WebP Image Codec"
-            VALUE "ProductVersion", "1.2.2"
+            VALUE "ProductVersion", "1.2.3"
        END
    END
    BLOCK "VarFileInfo"
--- a/src/libwebpdecoder.rc
+++ b/src/libwebpdecoder.rc
@ -6,8 +6,8 @@
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 1,0,2,2
- PRODUCTVERSION 1,0,2,2
+ FILEVERSION 1,0,2,3
+ PRODUCTVERSION 1,0,2,3
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@ -24,12 +24,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "Google, Inc."
            VALUE "FileDescription", "libwebpdecoder DLL"
-            VALUE "FileVersion", "1.2.2"
+            VALUE "FileVersion", "1.2.3"
            VALUE "InternalName", "libwebpdecoder.dll"
-            VALUE "LegalCopyright", "Copyright (C) 2021"
+            VALUE "LegalCopyright", "Copyright (C) 2022"
            VALUE "OriginalFilename", "libwebpdecoder.dll"
            VALUE "ProductName", "WebP Image Decoder"
-            VALUE "ProductVersion", "1.2.2"
+            VALUE "ProductVersion", "1.2.3"
        END
    END
    BLOCK "VarFileInfo"
--- a/src/mux/Makefile.am
+++ b/src/mux/Makefile.am
@ -17,6 +17,6 @@ noinst_HEADERS =
 noinst_HEADERS += ../webp/format_constants.h

 libwebpmux_la_LIBADD = ../libwebp.la
-libwebpmux_la_LDFLAGS = -no-undefined -version-info 3:8:0 -lm
+libwebpmux_la_LDFLAGS = -no-undefined -version-info 3:9:0 -lm
 libwebpmuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpmux.pc
--- a/src/mux/libwebpmux.rc
+++ b/src/mux/libwebpmux.rc
@ -6,8 +6,8 @@
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 1,0,2,2
- PRODUCTVERSION 1,0,2,2
+ FILEVERSION 1,0,2,3
+ PRODUCTVERSION 1,0,2,3
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@ -24,12 +24,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "Google, Inc."
            VALUE "FileDescription", "libwebpmux DLL"
-            VALUE "FileVersion", "1.2.2"
+            VALUE "FileVersion", "1.2.3"
            VALUE "InternalName", "libwebpmux.dll"
-            VALUE "LegalCopyright", "Copyright (C) 2021"
+            VALUE "LegalCopyright", "Copyright (C) 2022"
            VALUE "OriginalFilename", "libwebpmux.dll"
            VALUE "ProductName", "WebP Image Muxer"
-            VALUE "ProductVersion", "1.2.2"
+            VALUE "ProductVersion", "1.2.3"
        END
    END
    BLOCK "VarFileInfo"
--- a/Show More
+++ b/Show More