Remove the computation of ExtraCost when comparing histograms

Entropy clustering merges symbol histograms to reduce the overall entropy. The cost of 2 added histograms is compared to the 2 costs of the individual histograms and if it is smaller, a merge is done. Except for some symbols (distance and length), the computed cost is the real final cost based on the histogram, and some constant cost (independent from the probabilities of the symbols and hence the merge) because the symbol is encode as Golomb. This constant cost is useless and can be removed. Change-Id: I6271e8c0e4111cdeff544cbdb7dec3c67be5309c
Get AVX2 into WebP lossless
2025-07-03 01:24:30 +02:00 · 2025-03-28 15:00:41 +01:00 · 2025-03-28 11:44:03 +01:00
16 changed files with 1305 additions and 19 deletions
--- a/Makefile.vc
+++ b/Makefile.vc
@ -231,6 +231,7 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\lossless_neon.obj \
    $(DIROBJ)\dsp\lossless_sse2.obj \
    $(DIROBJ)\dsp\lossless_sse41.obj \
+    $(DIROBJ)\dsp\lossless_avx2.obj \
    $(DIROBJ)\dsp\rescaler.obj \
    $(DIROBJ)\dsp\rescaler_mips32.obj \
    $(DIROBJ)\dsp\rescaler_mips_dsp_r2.obj \
@ -270,6 +271,7 @@ DSP_ENC_OBJS = \
    $(DIROBJ)\dsp\lossless_enc_neon.obj \
    $(DIROBJ)\dsp\lossless_enc_sse2.obj \
    $(DIROBJ)\dsp\lossless_enc_sse41.obj \
+    $(DIROBJ)\dsp\lossless_enc_avx2.obj \
    $(DIROBJ)\dsp\ssim.obj \
    $(DIROBJ)\dsp\ssim_sse2.obj \

--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@ -94,6 +94,9 @@
 /* Set to 1 if SSE4.1 is supported */
 #cmakedefine WEBP_HAVE_SSE41 1

+/* Set to 1 if AVX2 is supported */
+#cmakedefine WEBP_HAVE_AVX2 1
+
 /* Set to 1 if TIFF library is installed */
 #cmakedefine WEBP_HAVE_TIFF 1

--- a/cmake/cpu.cmake
+++ b/cmake/cpu.cmake
@ -38,9 +38,9 @@ function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
 endfunction()

 # those are included in the names of WEBP_USE_* in c++ code.
-set(WEBP_SIMD_FLAGS "SSE41;SSE2;MIPS32;MIPS_DSP_R2;NEON;MSA")
+set(WEBP_SIMD_FLAGS "AVX2;SSE41;SSE2;MIPS32;MIPS_DSP_R2;NEON;MSA")
 set(WEBP_SIMD_FILE_EXTENSIONS
-    "_sse41.c;_sse2.c;_mips32.c;_mips_dsp_r2.c;_neon.c;_msa.c")
+    "_avx2.c;_sse41.c;_sse2.c;_mips32.c;_mips_dsp_r2.c;_neon.c;_msa.c")
 if(MSVC AND CMAKE_C_COMPILER_ID STREQUAL "MSVC")
  # With at least Visual Studio 12 (2013)+ /arch is not necessary to build SSE2
  # or SSE4 code unless a lesser /arch is forced. MSVC does not have a SSE4
@ -50,12 +50,12 @@ if(MSVC AND CMAKE_C_COMPILER_ID STREQUAL "MSVC")
  if(MSVC_VERSION GREATER_EQUAL 1800 AND NOT CMAKE_C_FLAGS MATCHES "/arch:")
    set(SIMD_ENABLE_FLAGS)
  else()
-    set(SIMD_ENABLE_FLAGS "/arch:AVX;/arch:SSE2;;;;")
+    set(SIMD_ENABLE_FLAGS "/arch:AVX2;/arch:AVX;/arch:SSE2;;;;")
  endif()
  set(SIMD_DISABLE_FLAGS)
 else()
-  set(SIMD_ENABLE_FLAGS "-msse4.1;-msse2;-mips32;-mdspr2;-mfpu=neon;-mmsa")
-  set(SIMD_DISABLE_FLAGS "-mno-sse4.1;-mno-sse2;;-mno-dspr2;;-mno-msa")
+  set(SIMD_ENABLE_FLAGS "-mavx2;-msse4.1;-msse2;-mips32;-mdspr2;-mfpu=neon;-mmsa")
+  set(SIMD_DISABLE_FLAGS "-mno-avx2;-mno-sse4.1;-mno-sse2;;-mno-dspr2;;-mno-msa")
 endif()

 set(WEBP_SIMD_FILES_TO_INCLUDE)
--- a/configure.ac
+++ b/configure.ac
@ -161,6 +161,25 @@ AS_IF([test "$GCC" = "yes" ], [
 AC_SUBST([AM_CFLAGS])

 dnl === Check for machine specific flags
+AC_ARG_ENABLE([avx2],
+              AS_HELP_STRING([--disable-avx2],
+                             [Disable detection of AVX2 support
+                              @<:@default=auto@:>@]))
+
+AS_IF([test "x$enable_avx2" != "xno" -a "x$enable_sse4_1" != "xno"
+      -a "x$enable_sse2" != "xno"], [
+  AVX2_FLAGS="$INTRINSICS_CFLAGS $AVX2_FLAGS"
+  TEST_AND_ADD_CFLAGS([AVX2_FLAGS], [-mavx2])
+  AS_IF([test -n "$AVX2_FLAGS"], [
+    SAVED_CFLAGS=$CFLAGS
+    CFLAGS="$CFLAGS $AVX2_FLAGS"
+    AC_CHECK_HEADER([immintrin.h],
+                    [AC_DEFINE(WEBP_HAVE_AVX2, [1],
+                     [Set to 1 if AVX2 is supported])],
+                    [AVX2_FLAGS=""])
+    CFLAGS=$SAVED_CFLAGS])
+  AC_SUBST([AVX2_FLAGS])])
+
 AC_ARG_ENABLE([sse4.1],
              AS_HELP_STRING([--disable-sse4.1],
                             [Disable detection of SSE4.1 support
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -5,6 +5,8 @@ noinst_LTLIBRARIES += libwebpdsp_sse2.la
 noinst_LTLIBRARIES += libwebpdspdecode_sse2.la
 noinst_LTLIBRARIES += libwebpdsp_sse41.la
 noinst_LTLIBRARIES += libwebpdspdecode_sse41.la
+noinst_LTLIBRARIES += libwebpdsp_avx2.la
+noinst_LTLIBRARIES += libwebpdspdecode_avx2.la
 noinst_LTLIBRARIES += libwebpdsp_neon.la
 noinst_LTLIBRARIES += libwebpdspdecode_neon.la
 noinst_LTLIBRARIES += libwebpdsp_msa.la
@ -44,6 +46,11 @@ ENC_SOURCES += lossless_enc.c
 ENC_SOURCES += quant.h
 ENC_SOURCES += ssim.c

+libwebpdspdecode_avx2_la_SOURCES =
+libwebpdspdecode_avx2_la_SOURCES += lossless_avx2.c
+libwebpdspdecode_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdspdecode_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)
+
 libwebpdspdecode_sse41_la_SOURCES =
 libwebpdspdecode_sse41_la_SOURCES += alpha_processing_sse41.c
 libwebpdspdecode_sse41_la_SOURCES += dec_sse41.c
@ -123,6 +130,12 @@ libwebpdsp_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)
 libwebpdsp_sse41_la_LIBADD = libwebpdspdecode_sse41.la

+libwebpdsp_avx2_la_SOURCES =
+libwebpdsp_avx2_la_SOURCES += lossless_enc_avx2.c
+libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)
+libwebpdsp_avx2_la_LIBADD = libwebpdspdecode_avx2.la
+
 libwebpdsp_neon_la_SOURCES =
 libwebpdsp_neon_la_SOURCES += cost_neon.c
 libwebpdsp_neon_la_SOURCES += enc_neon.c
@ -167,6 +180,7 @@ libwebpdsp_la_LDFLAGS = -lm
 libwebpdsp_la_LIBADD =
 libwebpdsp_la_LIBADD += libwebpdsp_sse2.la
 libwebpdsp_la_LIBADD += libwebpdsp_sse41.la
+libwebpdsp_la_LIBADD += libwebpdsp_avx2.la
 libwebpdsp_la_LIBADD += libwebpdsp_neon.la
 libwebpdsp_la_LIBADD += libwebpdsp_msa.la
 libwebpdsp_la_LIBADD += libwebpdsp_mips32.la
@ -180,6 +194,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdspdecode_la_LIBADD =
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse2.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse41.la
+  libwebpdspdecode_la_LIBADD += libwebpdspdecode_avx2.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_neon.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_msa.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_mips32.la
--- a/src/dsp/cpu.h
+++ b/src/dsp/cpu.h
@ -56,6 +56,11 @@
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
 #endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_AVX2  // Visual C++ AVX2 targets
+#endif
 #endif

 // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
@ -80,6 +85,16 @@
 #define WEBP_HAVE_SSE41
 #endif

+#if (defined(__AVX2__) || defined(WEBP_MSC_AVX2)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_AVX2))
+#define WEBP_USE_AVX2
+#endif
+
+#if defined(WEBP_USE_AVX2) && !defined(WEBP_HAVE_AVX2)
+#define WEBP_HAVE_AVX2
+#endif
+
+#undef WEBP_MSC_AVX2
 #undef WEBP_MSC_SSE41
 #undef WEBP_MSC_SSE2

--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@ -577,16 +577,21 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
 //------------------------------------------------------------------------------

 VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed_SSE;
 VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
+VP8LPredictorAddSubFunc VP8LPredictorsAdd_SSE[16];
 VP8LPredictorFunc VP8LPredictors[16];

 // exposed plain-C implementations
 VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];

 VP8LTransformColorInverseFunc VP8LTransformColorInverse;
+VP8LTransformColorInverseFunc VP8LTransformColorInverse_SSE;

 VP8LConvertFunc VP8LConvertBGRAToRGB;
+VP8LConvertFunc VP8LConvertBGRAToRGB_SSE;
 VP8LConvertFunc VP8LConvertBGRAToRGBA;
+VP8LConvertFunc VP8LConvertBGRAToRGBA_SSE;
 VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
 VP8LConvertFunc VP8LConvertBGRAToRGB565;
 VP8LConvertFunc VP8LConvertBGRAToBGR;
@ -597,6 +602,7 @@ VP8LMapAlphaFunc VP8LMapColor8b;
 extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8LDspInitSSE2(void);
 extern void VP8LDspInitSSE41(void);
+extern void VP8LDspInitAVX2(void);
 extern void VP8LDspInitNEON(void);
 extern void VP8LDspInitMIPSdspR2(void);
 extern void VP8LDspInitMSA(void);
@ -649,6 +655,11 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
 #if defined(WEBP_HAVE_SSE41)
      if (VP8GetCPUInfo(kSSE4_1)) {
        VP8LDspInitSSE41();
+#if defined(WEBP_HAVE_AVX2)
+        if (VP8GetCPUInfo(kAVX2)) {
+          VP8LDspInitAVX2();
+        }
+#endif
      }
 #endif
    }
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -64,10 +64,12 @@ typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
                                        uint32_t* WEBP_RESTRICT out);
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
+extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_SSE[16];

 typedef void (*VP8LProcessDecBlueAndRedFunc)(const uint32_t* src,
                                             int num_pixels, uint32_t* dst);
 extern VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+extern VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed_SSE;

 typedef struct {
  // Note: the members are uint8_t, so that any negative values are
@ -80,6 +82,7 @@ typedef void (*VP8LTransformColorInverseFunc)(const VP8LMultipliers* const m,
                                              const uint32_t* src,
                                              int num_pixels, uint32_t* dst);
 extern VP8LTransformColorInverseFunc VP8LTransformColorInverse;
+extern VP8LTransformColorInverseFunc VP8LTransformColorInverse_SSE;

 struct VP8LTransform;  // Defined in dec/vp8li.h.

@ -99,6 +102,8 @@ extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
 extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
 extern VP8LConvertFunc VP8LConvertBGRAToBGR;
+extern VP8LConvertFunc VP8LConvertBGRAToRGB_SSE;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA_SSE;

 // Converts from BGRA to other color spaces.
 void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
@ -149,21 +154,25 @@ void VP8LDspInit(void);

 typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
 extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed_SSE;
 typedef void (*VP8LTransformColorFunc)(
    const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT dst,
    int num_pixels);
 extern VP8LTransformColorFunc VP8LTransformColor;
+extern VP8LTransformColorFunc VP8LTransformColor_SSE;
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
    const uint32_t* WEBP_RESTRICT argb, int stride,
    int tile_width, int tile_height,
    int green_to_blue, int red_to_blue, uint32_t histo[]);
 extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms_SSE;

 typedef void (*VP8LCollectColorRedTransformsFunc)(
    const uint32_t* WEBP_RESTRICT argb, int stride,
    int tile_width, int tile_height,
    int green_to_red, uint32_t histo[]);
 extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
+extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms_SSE;

 // Expose some C-only fallback functions
 void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m,
@ -181,6 +190,7 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb,

 extern VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
+extern VP8LPredictorAddSubFunc VP8LPredictorsSub_SSE[16];

 // -----------------------------------------------------------------------------
 // Huffman-cost related functions.
@ -255,6 +265,7 @@ typedef void (*VP8LBundleColorMapFunc)(const uint8_t* WEBP_RESTRICT const row,
                                       int width, int xbits,
                                       uint32_t* WEBP_RESTRICT dst);
 extern VP8LBundleColorMapFunc VP8LBundleColorMap;
+extern VP8LBundleColorMapFunc VP8LBundleColorMap_SSE;
 void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row,
                          int width, int xbits, uint32_t* WEBP_RESTRICT dst);

--- a/src/dsp/lossless_avx2.c
+++ b/src/dsp/lossless_avx2.c
@ -0,0 +1,442 @@
+// Copyright 2025 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// AVX2 variant of methods for lossless decoder
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_AVX2)
+
+#include <immintrin.h>
+
+#include "src/dsp/cpu.h"
+#include "src/dsp/lossless.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/types.h"
+
+//------------------------------------------------------------------------------
+// Predictor Transform
+
+static WEBP_INLINE void Average2_m256i(const __m256i* const a0,
+                                       const __m256i* const a1,
+                                       __m256i* const avg) {
+  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+  const __m256i ones = _mm256_set1_epi8(1);
+  const __m256i avg1 = _mm256_avg_epu8(*a0, *a1);
+  const __m256i one = _mm256_and_si256(_mm256_xor_si256(*a0, *a1), ones);
+  *avg = _mm256_sub_epi8(avg1, one);
+}
+
+// Batch versions of those functions.
+
+// Predictor0: ARGB_BLACK.
+static void PredictorAdd0_AVX2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i;
+  const __m256i black = _mm256_set1_epi32((int)ARGB_BLACK);
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    const __m256i res = _mm256_add_epi8(src, black);
+    _mm256_storeu_si256((__m256i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsAdd_SSE[0](in + i, NULL, num_pixels - i, out + i);
+  }
+  (void)upper;
+}
+
+// Predictor1: left.
+static void PredictorAdd1_AVX2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i;
+  __m256i prev = _mm256_set1_epi32((int)out[-1]);
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    // h | g | f | e | d | c | b | a
+    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    // g | f | e | 0 | c | b | a | 0
+    const __m256i shift0 = _mm256_slli_si256(src, 4);
+    // g + h | f + g | e + f | e | c + d | b + c | a + b | a
+    const __m256i sum0 = _mm256_add_epi8(src, shift0);
+    // e + f | e | 0 | 0 | a + b | a | 0 | 0
+    const __m256i shift1 = _mm256_slli_si256(sum0, 8);
+    // e + f + g + h | e + f + g | e + f | e | a + b + c + d | a + b + c | a + b
+    // | a
+    const __m256i sum1 = _mm256_add_epi8(sum0, shift1);
+    // Add a + b + c + d to the upper lane.
+    const int32_t sum_abcd = _mm256_extract_epi32(sum1, 3);
+    const __m256i sum2 = _mm256_add_epi8(
+        sum1,
+        _mm256_set_epi32(sum_abcd, sum_abcd, sum_abcd, sum_abcd, 0, 0, 0, 0));
+
+    const __m256i res = _mm256_add_epi8(sum2, prev);
+    _mm256_storeu_si256((__m256i*)&out[i], res);
+    // replicate last res output in prev.
+    prev = _mm256_permutevar8x32_epi32(
+        res, _mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 7));
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsAdd_SSE[1](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Macro that adds 32-bit integers from IN using mod 256 arithmetic
+// per 8 bit channel.
+#define GENERATE_PREDICTOR_1(X, IN)                                         \
+  static void PredictorAdd##X##_AVX2(const uint32_t* in,                    \
+                                     const uint32_t* upper, int num_pixels, \
+                                     uint32_t* WEBP_RESTRICT out) {         \
+    int i;                                                                  \
+    for (i = 0; i + 8 <= num_pixels; i += 8) {                              \
+      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);       \
+      const __m256i other = _mm256_loadu_si256((const __m256i*)&(IN));      \
+      const __m256i res = _mm256_add_epi8(src, other);                      \
+      _mm256_storeu_si256((__m256i*)&out[i], res);                          \
+    }                                                                       \
+    if (i != num_pixels) {                                                  \
+      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
+    }                                                                       \
+  }
+
+// Predictor2: Top.
+GENERATE_PREDICTOR_1(2, upper[i])
+// Predictor3: Top-right.
+GENERATE_PREDICTOR_1(3, upper[i + 1])
+// Predictor4: Top-left.
+GENERATE_PREDICTOR_1(4, upper[i - 1])
+#undef GENERATE_PREDICTOR_1
+
+// Due to averages with integers, values cannot be accumulated in parallel for
+// predictors 5 to 7.
+
+#define GENERATE_PREDICTOR_2(X, IN)                                         \
+  static void PredictorAdd##X##_AVX2(const uint32_t* in,                    \
+                                     const uint32_t* upper, int num_pixels, \
+                                     uint32_t* WEBP_RESTRICT out) {         \
+    int i;                                                                  \
+    for (i = 0; i + 8 <= num_pixels; i += 8) {                              \
+      const __m256i Tother = _mm256_loadu_si256((const __m256i*)&(IN));     \
+      const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);      \
+      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);       \
+      __m256i avg, res;                                                     \
+      Average2_m256i(&T, &Tother, &avg);                                    \
+      res = _mm256_add_epi8(avg, src);                                      \
+      _mm256_storeu_si256((__m256i*)&out[i], res);                          \
+    }                                                                       \
+    if (i != num_pixels) {                                                  \
+      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
+    }                                                                       \
+  }
+// Predictor8: average TL T.
+GENERATE_PREDICTOR_2(8, upper[i - 1])
+// Predictor9: average T TR.
+GENERATE_PREDICTOR_2(9, upper[i + 1])
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: average of (average of (L,TL), average of (T, TR)).
+#define DO_PRED10(OUT)                                  \
+  do {                                                  \
+    __m256i avgLTL, avg;                                \
+    Average2_m256i(&L, &TL, &avgLTL);                   \
+    Average2_m256i(&avgTTR, &avgLTL, &avg);             \
+    L = _mm256_add_epi8(avg, src);                      \
+    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L); \
+  } while (0)
+
+#define DO_PRED10_SHIFT                                         \
+  do {                                                          \
+    /* Rotate the pre-computed values for the next iteration.*/ \
+    avgTTR = _mm256_srli_si256(avgTTR, 4);                      \
+    TL = _mm256_srli_si256(TL, 4);                              \
+    src = _mm256_srli_si256(src, 4);                            \
+  } while (0)
+
+static void PredictorAdd10_AVX2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i, j;
+  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
+    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
+    const __m256i TR = _mm256_loadu_si256((const __m256i*)&upper[i + 1]);
+    __m256i avgTTR;
+    Average2_m256i(&T, &TR, &avgTTR);
+    {
+      const __m256i avgTTR_bak = avgTTR;
+      const __m256i TL_bak = TL;
+      const __m256i src_bak = src;
+      for (j = 0; j < 4; ++j) {
+        DO_PRED10(j);
+        DO_PRED10_SHIFT;
+      }
+      avgTTR = _mm256_permute2x128_si256(avgTTR_bak, avgTTR_bak, 1);
+      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
+      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
+      for (; j < 8; ++j) {
+        DO_PRED10(j);
+        DO_PRED10_SHIFT;
+      }
+    }
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsAdd_SSE[10](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+#undef DO_PRED10
+#undef DO_PRED10_SHIFT
+
+// Predictor11: select.
+#define DO_PRED11(OUT)                                                      \
+  do {                                                                      \
+    const __m256i L_lo = _mm256_unpacklo_epi32(L, T);                       \
+    const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);                     \
+    const __m256i pb = _mm256_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/   \
+    const __m256i mask = _mm256_cmpgt_epi32(pb, pa);                        \
+    const __m256i A = _mm256_and_si256(mask, L);                            \
+    const __m256i B = _mm256_andnot_si256(mask, T);                         \
+    const __m256i pred = _mm256_or_si256(A, B); /* pred = (pa > b)? L : T*/ \
+    L = _mm256_add_epi8(src, pred);                                         \
+    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L);                     \
+  } while (0)
+
+#define DO_PRED11_SHIFT                                       \
+  do {                                                        \
+    /* Shift the pre-computed value for the next iteration.*/ \
+    T = _mm256_srli_si256(T, 4);                              \
+    TL = _mm256_srli_si256(TL, 4);                            \
+    src = _mm256_srli_si256(src, 4);                          \
+    pa = _mm256_srli_si256(pa, 4);                            \
+  } while (0)
+
+static void PredictorAdd11_AVX2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i, j;
+  __m256i pa;
+  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
+    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
+    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    {
+      // We can unpack with any value on the upper 32 bits, provided it's the
+      // same on both operands (so that their sum of abs diff is zero). Here we
+      // use T.
+      const __m256i T_lo = _mm256_unpacklo_epi32(T, T);
+      const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);
+      const __m256i T_hi = _mm256_unpackhi_epi32(T, T);
+      const __m256i TL_hi = _mm256_unpackhi_epi32(TL, T);
+      const __m256i s_lo = _mm256_sad_epu8(T_lo, TL_lo);
+      const __m256i s_hi = _mm256_sad_epu8(T_hi, TL_hi);
+      pa = _mm256_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
+    }
+    {
+      const __m256i T_bak = T;
+      const __m256i TL_bak = TL;
+      const __m256i src_bak = src;
+      const __m256i pa_bak = pa;
+      for (j = 0; j < 4; ++j) {
+        DO_PRED11(j);
+        DO_PRED11_SHIFT;
+      }
+      T = _mm256_permute2x128_si256(T_bak, T_bak, 1);
+      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
+      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
+      pa = _mm256_permute2x128_si256(pa_bak, pa_bak, 1);
+      for (; j < 8; ++j) {
+        DO_PRED11(j);
+        DO_PRED11_SHIFT;
+      }
+    }
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsAdd_SSE[11](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+#undef DO_PRED11
+#undef DO_PRED11_SHIFT
+
+// Predictor12: ClampedAddSubtractFull.
+#define DO_PRED12(DIFF, OUT)                              \
+  do {                                                    \
+    const __m256i all = _mm256_add_epi16(L, (DIFF));      \
+    const __m256i alls = _mm256_packus_epi16(all, all);   \
+    const __m256i res = _mm256_add_epi8(src, alls);       \
+    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(res); \
+    L = _mm256_unpacklo_epi8(res, zero);                  \
+  } while (0)
+
+#define DO_PRED12_SHIFT(DIFF, LANE)                           \
+  do {                                                        \
+    /* Shift the pre-computed value for the next iteration.*/ \
+    if ((LANE) == 0) (DIFF) = _mm256_srli_si256(DIFF, 8);     \
+    src = _mm256_srli_si256(src, 4);                          \
+  } while (0)
+
+static void PredictorAdd12_AVX2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i;
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i L8 = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
+  __m256i L = _mm256_unpacklo_epi8(L8, zero);
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    // Load 8 pixels at a time.
+    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
+    const __m256i T_lo = _mm256_unpacklo_epi8(T, zero);
+    const __m256i T_hi = _mm256_unpackhi_epi8(T, zero);
+    const __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
+    const __m256i TL_lo = _mm256_unpacklo_epi8(TL, zero);
+    const __m256i TL_hi = _mm256_unpackhi_epi8(TL, zero);
+    __m256i diff_lo = _mm256_sub_epi16(T_lo, TL_lo);
+    __m256i diff_hi = _mm256_sub_epi16(T_hi, TL_hi);
+    const __m256i diff_lo_bak = diff_lo;
+    const __m256i diff_hi_bak = diff_hi;
+    const __m256i src_bak = src;
+    DO_PRED12(diff_lo, 0);
+    DO_PRED12_SHIFT(diff_lo, 0);
+    DO_PRED12(diff_lo, 1);
+    DO_PRED12_SHIFT(diff_lo, 0);
+    DO_PRED12(diff_hi, 2);
+    DO_PRED12_SHIFT(diff_hi, 0);
+    DO_PRED12(diff_hi, 3);
+    DO_PRED12_SHIFT(diff_hi, 0);
+
+    // Process the upper lane.
+    diff_lo = _mm256_permute2x128_si256(diff_lo_bak, diff_lo_bak, 1);
+    diff_hi = _mm256_permute2x128_si256(diff_hi_bak, diff_hi_bak, 1);
+    src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
+
+    DO_PRED12(diff_lo, 4);
+    DO_PRED12_SHIFT(diff_lo, 0);
+    DO_PRED12(diff_lo, 5);
+    DO_PRED12_SHIFT(diff_lo, 1);
+    DO_PRED12(diff_hi, 6);
+    DO_PRED12_SHIFT(diff_hi, 0);
+    DO_PRED12(diff_hi, 7);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsAdd_SSE[12](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+#undef DO_PRED12
+#undef DO_PRED12_SHIFT
+
+// Due to averages with integers, values cannot be accumulated in parallel for
+// predictors 13.
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void AddGreenToBlueAndRed_AVX2(const uint32_t* const src, int num_pixels,
+                                      uint32_t* dst) {
+  int i;
+  const __m256i kCstShuffle = _mm256_set_epi8(
+      -1, 29, -1, 29, -1, 25, -1, 25, -1, 21, -1, 21, -1, 17, -1, 17, -1, 13,
+      -1, 13, -1, 9, -1, 9, -1, 5, -1, 5, -1, 1, -1, 1);
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i in = _mm256_loadu_si256((const __m256i*)&src[i]);  // argb
+    const __m256i in_0g0g = _mm256_shuffle_epi8(in, kCstShuffle);    // 0g0g
+    const __m256i out = _mm256_add_epi8(in, in_0g0g);
+    _mm256_storeu_si256((__m256i*)&dst[i], out);
+  }
+  // fallthrough and finish off with SSE.
+  if (i != num_pixels) {
+    VP8LAddGreenToBlueAndRed_SSE(src + i, num_pixels - i, dst + i);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static void TransformColorInverse_AVX2(const VP8LMultipliers* const m,
+                                       const uint32_t* const src,
+                                       int num_pixels, uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 5.
+#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
+  const __m256i mults_rb =
+      _mm256_set1_epi32((int)((uint32_t)CST(green_to_red_) << 16 |
+                              (CST(green_to_blue_) & 0xffff)));
+  const __m256i mults_b2 = _mm256_set1_epi32(CST(red_to_blue_));
+#undef CST
+  const __m256i mask_ag = _mm256_set1_epi32((int)0xff00ff00);
+  const __m256i perm1 = _mm256_setr_epi8(
+      -1, 1, -1, 1, -1, 5, -1, 5, -1, 9, -1, 9, -1, 13, -1, 13, -1, 17, -1, 17,
+      -1, 21, -1, 21, -1, 25, -1, 25, -1, 29, -1, 29);
+  const __m256i perm2 = _mm256_setr_epi8(
+      -1, 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, 18, -1,
+      -1, -1, 22, -1, -1, -1, 26, -1, -1, -1, 30, -1, -1);
+  int i;
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i A = _mm256_loadu_si256((const __m256i*)(src + i));
+    const __m256i B = _mm256_shuffle_epi8(A, perm1);  // argb -> g0g0
+    const __m256i C = _mm256_mulhi_epi16(B, mults_rb);
+    const __m256i D = _mm256_add_epi8(A, C);
+    const __m256i E = _mm256_shuffle_epi8(D, perm2);
+    const __m256i F = _mm256_mulhi_epi16(E, mults_b2);
+    const __m256i G = _mm256_add_epi8(D, F);
+    const __m256i out = _mm256_blendv_epi8(G, A, mask_ag);
+    _mm256_storeu_si256((__m256i*)&dst[i], out);
+  }
+  // Fall-back to SSE-version for left-overs.
+  if (i != num_pixels) {
+    VP8LTransformColorInverse_SSE(m, src + i, num_pixels - i, dst + i);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
+static void ConvertBGRAToRGBA_AVX2(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
+  const __m256i* in = (const __m256i*)src;
+  __m256i* out = (__m256i*)dst;
+  while (num_pixels >= 8) {
+    const __m256i A = _mm256_loadu_si256(in++);
+    const __m256i B = _mm256_shuffle_epi8(
+        A,
+        _mm256_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2,
+                        15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2));
+    _mm256_storeu_si256(out++, B);
+    num_pixels -= 8;
+  }
+  // left-overs
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToRGBA_SSE((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitAVX2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitAVX2(void) {
+  VP8LPredictorsAdd[0] = PredictorAdd0_AVX2;
+  VP8LPredictorsAdd[1] = PredictorAdd1_AVX2;
+  VP8LPredictorsAdd[2] = PredictorAdd2_AVX2;
+  VP8LPredictorsAdd[3] = PredictorAdd3_AVX2;
+  VP8LPredictorsAdd[4] = PredictorAdd4_AVX2;
+  VP8LPredictorsAdd[8] = PredictorAdd8_AVX2;
+  VP8LPredictorsAdd[9] = PredictorAdd9_AVX2;
+  VP8LPredictorsAdd[10] = PredictorAdd10_AVX2;
+  VP8LPredictorsAdd[11] = PredictorAdd11_AVX2;
+  VP8LPredictorsAdd[12] = PredictorAdd12_AVX2;
+
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_AVX2;
+  VP8LTransformColorInverse = TransformColorInverse_AVX2;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_AVX2;
+}
+
+#else  // !WEBP_USE_AVX2
+
+WEBP_DSP_INIT_STUB(VP8LDspInitAVX2)
+
+#endif  // WEBP_USE_AVX2
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
@ -713,11 +713,15 @@ GENERATE_PREDICTOR_SUB(13)
 //------------------------------------------------------------------------------

 VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed_SSE;

 VP8LTransformColorFunc VP8LTransformColor;
+VP8LTransformColorFunc VP8LTransformColor_SSE;

 VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms_SSE;
 VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
+VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms_SSE;

 VP8LFastLog2SlowFunc VP8LFastLog2Slow;
 VP8LFastSLog2SlowFunc VP8LFastSLog2Slow;
@ -735,13 +739,16 @@ VP8LAddVectorEqFunc VP8LAddVectorEq;

 VP8LVectorMismatchFunc VP8LVectorMismatch;
 VP8LBundleColorMapFunc VP8LBundleColorMap;
+VP8LBundleColorMapFunc VP8LBundleColorMap_SSE;

 VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
 VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
+VP8LPredictorAddSubFunc VP8LPredictorsSub_SSE[16];

 extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8LEncDspInitSSE2(void);
 extern void VP8LEncDspInitSSE41(void);
+extern void VP8LEncDspInitAVX2(void);
 extern void VP8LEncDspInitNEON(void);
 extern void VP8LEncDspInitMIPS32(void);
 extern void VP8LEncDspInitMIPSdspR2(void);
@ -818,6 +825,11 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
 #if defined(WEBP_HAVE_SSE41)
      if (VP8GetCPUInfo(kSSE4_1)) {
        VP8LEncDspInitSSE41();
+#if defined(WEBP_HAVE_AVX2)
+        if (VP8GetCPUInfo(kAVX2)) {
+          VP8LEncDspInitAVX2();
+        }
+#endif
      }
 #endif
    }
--- a/src/dsp/lossless_enc_avx2.c
+++ b/src/dsp/lossless_enc_avx2.c
@ -0,0 +1,733 @@
+// Copyright 2025 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// AVX2 variant of methods for lossless encoder
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_AVX2)
+#include <assert.h>
+#include <immintrin.h>
+
+#include "src/dsp/cpu.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/types.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed_AVX2(uint32_t* argb_data,
+                                             int num_pixels) {
+  int i;
+  const __m256i kCstShuffle = _mm256_set_epi8(
+      -1, 29, -1, 29, -1, 25, -1, 25, -1, 21, -1, 21, -1, 17, -1, 17, -1, 13,
+      -1, 13, -1, 9, -1, 9, -1, 5, -1, 5, -1, 1, -1, 1);
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i in = _mm256_loadu_si256((__m256i*)&argb_data[i]);  // argb
+    const __m256i in_0g0g = _mm256_shuffle_epi8(in, kCstShuffle);
+    const __m256i out = _mm256_sub_epi8(in, in_0g0g);
+    _mm256_storeu_si256((__m256i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-SSE
+  if (i != num_pixels) {
+    VP8LSubtractGreenFromBlueAndRed_SSE(argb_data + i, num_pixels - i);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
+#define MK_CST_16(HI, LO) \
+  _mm256_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
+static void TransformColor_AVX2(const VP8LMultipliers* WEBP_RESTRICT const m,
+                                uint32_t* WEBP_RESTRICT argb_data,
+                                int num_pixels) {
+  const __m256i mults_rb =
+      MK_CST_16(CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
+  const __m256i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
+  const __m256i mask_rb = _mm256_set1_epi32(0x00ff00ff);  // red-blue masks
+  const __m256i kCstShuffle = _mm256_set_epi8(
+      29, -1, 29, -1, 25, -1, 25, -1, 21, -1, 21, -1, 17, -1, 17, -1, 13, -1,
+      13, -1, 9, -1, 9, -1, 5, -1, 5, -1, 1, -1, 1, -1);
+  int i;
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i in = _mm256_loadu_si256((__m256i*)&argb_data[i]);  // argb
+    const __m256i A = _mm256_shuffle_epi8(in, kCstShuffle);          // g0g0
+    const __m256i B = _mm256_mulhi_epi16(A, mults_rb);  // x dr  x db1
+    const __m256i C = _mm256_slli_epi16(in, 8);         // r 0   b   0
+    const __m256i D = _mm256_mulhi_epi16(C, mults_b2);  // x db2 0   0
+    const __m256i E = _mm256_srli_epi32(D, 16);         // 0 0   x db2
+    const __m256i F = _mm256_add_epi8(E, B);            // x dr  x  db
+    const __m256i G = _mm256_and_si256(F, mask_rb);     // 0 dr  0  db
+    const __m256i out = _mm256_sub_epi8(in, G);
+    _mm256_storeu_si256((__m256i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  if (i != num_pixels) {
+    VP8LTransformColor_SSE(m, argb_data + i, num_pixels - i);
+  }
+}
+
+//------------------------------------------------------------------------------
+#define SPAN 16
+static void CollectColorBlueTransforms_AVX2(const uint32_t* WEBP_RESTRICT argb,
+                                            int stride, int tile_width,
+                                            int tile_height, int green_to_blue,
+                                            int red_to_blue, uint32_t histo[]) {
+  const __m256i mult =
+      MK_CST_16(CST_5b(red_to_blue) + 256, CST_5b(green_to_blue));
+  const __m256i perm = _mm256_setr_epi8(
+      -1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14, -1, 17, -1, 18,
+      -1, 21, -1, 22, -1, 25, -1, 26, -1, 29, -1, 30);
+  if (tile_width >= 8) {
+    int y, i;
+    for (y = 0; y < tile_height; ++y) {
+      uint8_t values[32];
+      const uint32_t* const src = argb + y * stride;
+      const __m256i A1 = _mm256_loadu_si256((const __m256i*)src);
+      const __m256i B1 = _mm256_shuffle_epi8(A1, perm);
+      const __m256i C1 = _mm256_mulhi_epi16(B1, mult);
+      const __m256i D1 = _mm256_sub_epi16(A1, C1);
+      __m256i E = _mm256_add_epi16(_mm256_srli_epi32(D1, 16), D1);
+      int x;
+      for (x = 8; x + 8 <= tile_width; x += 8) {
+        const __m256i A2 = _mm256_loadu_si256((const __m256i*)(src + x));
+        __m256i B2, C2, D2;
+        _mm256_storeu_si256((__m256i*)values, E);
+        for (i = 0; i < 32; i += 4) ++histo[values[i]];
+        B2 = _mm256_shuffle_epi8(A2, perm);
+        C2 = _mm256_mulhi_epi16(B2, mult);
+        D2 = _mm256_sub_epi16(A2, C2);
+        E = _mm256_add_epi16(_mm256_srli_epi32(D2, 16), D2);
+      }
+      _mm256_storeu_si256((__m256i*)values, E);
+      for (i = 0; i < 32; i += 4) ++histo[values[i]];
+    }
+  }
+  {
+    const int left_over = tile_width & 7;
+    if (left_over > 0) {
+      VP8LCollectColorBlueTransforms_SSE(argb + tile_width - left_over, stride,
+                                         left_over, tile_height, green_to_blue,
+                                         red_to_blue, histo);
+    }
+  }
+}
+
+static void CollectColorRedTransforms_AVX2(const uint32_t* WEBP_RESTRICT argb,
+                                           int stride, int tile_width,
+                                           int tile_height, int green_to_red,
+                                           uint32_t histo[]) {
+  const __m256i mult = MK_CST_16(0, CST_5b(green_to_red));
+  const __m256i mask_g = _mm256_set1_epi32(0x0000ff00);
+  if (tile_width >= 8) {
+    int y, i;
+    for (y = 0; y < tile_height; ++y) {
+      uint8_t values[32];
+      const uint32_t* const src = argb + y * stride;
+      const __m256i A1 = _mm256_loadu_si256((const __m256i*)src);
+      const __m256i B1 = _mm256_and_si256(A1, mask_g);
+      const __m256i C1 = _mm256_madd_epi16(B1, mult);
+      __m256i D = _mm256_sub_epi16(A1, C1);
+      int x;
+      for (x = 8; x + 8 <= tile_width; x += 8) {
+        const __m256i A2 = _mm256_loadu_si256((const __m256i*)(src + x));
+        __m256i B2, C2;
+        _mm256_storeu_si256((__m256i*)values, D);
+        for (i = 2; i < 32; i += 4) ++histo[values[i]];
+        B2 = _mm256_and_si256(A2, mask_g);
+        C2 = _mm256_madd_epi16(B2, mult);
+        D = _mm256_sub_epi16(A2, C2);
+      }
+      _mm256_storeu_si256((__m256i*)values, D);
+      for (i = 2; i < 32; i += 4) ++histo[values[i]];
+    }
+  }
+  {
+    const int left_over = tile_width & 7;
+    if (left_over > 0) {
+      VP8LCollectColorRedTransforms_SSE(argb + tile_width - left_over, stride,
+                                        left_over, tile_height, green_to_red,
+                                        histo);
+    }
+  }
+}
+#undef SPAN
+#undef MK_CST_16
+
+//------------------------------------------------------------------------------
+
+// Note we are adding uint32_t's as *signed* int32's (using _mm256_add_epi32).
+// But that's ok since the histogram values are less than 1<<28 (max picture
+// size).
+static void AddVector_AVX2(const uint32_t* WEBP_RESTRICT a,
+                           const uint32_t* WEBP_RESTRICT b,
+                           uint32_t* WEBP_RESTRICT out, int size) {
+  int i = 0;
+  int aligned_size = size & ~31;
+  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
+  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
+  // 2). See the usage in VP8LHistogramAdd().
+  assert(size >= 32);
+  assert(size % 2 == 0);
+
+  do {
+    const __m256i a0 = _mm256_loadu_si256((const __m256i*)&a[i + 0]);
+    const __m256i a1 = _mm256_loadu_si256((const __m256i*)&a[i + 8]);
+    const __m256i a2 = _mm256_loadu_si256((const __m256i*)&a[i + 16]);
+    const __m256i a3 = _mm256_loadu_si256((const __m256i*)&a[i + 24]);
+    const __m256i b0 = _mm256_loadu_si256((const __m256i*)&b[i + 0]);
+    const __m256i b1 = _mm256_loadu_si256((const __m256i*)&b[i + 8]);
+    const __m256i b2 = _mm256_loadu_si256((const __m256i*)&b[i + 16]);
+    const __m256i b3 = _mm256_loadu_si256((const __m256i*)&b[i + 24]);
+    _mm256_storeu_si256((__m256i*)&out[i + 0], _mm256_add_epi32(a0, b0));
+    _mm256_storeu_si256((__m256i*)&out[i + 8], _mm256_add_epi32(a1, b1));
+    _mm256_storeu_si256((__m256i*)&out[i + 16], _mm256_add_epi32(a2, b2));
+    _mm256_storeu_si256((__m256i*)&out[i + 24], _mm256_add_epi32(a3, b3));
+    i += 32;
+  } while (i != aligned_size);
+
+  if ((size & 16) != 0) {
+    const __m256i a0 = _mm256_loadu_si256((const __m256i*)&a[i + 0]);
+    const __m256i a1 = _mm256_loadu_si256((const __m256i*)&a[i + 8]);
+    const __m256i b0 = _mm256_loadu_si256((const __m256i*)&b[i + 0]);
+    const __m256i b1 = _mm256_loadu_si256((const __m256i*)&b[i + 8]);
+    _mm256_storeu_si256((__m256i*)&out[i + 0], _mm256_add_epi32(a0, b0));
+    _mm256_storeu_si256((__m256i*)&out[i + 8], _mm256_add_epi32(a1, b1));
+    i += 16;
+  }
+
+  size &= 15;
+  if (size == 8) {
+    const __m256i a0 = _mm256_loadu_si256((const __m256i*)&a[i]);
+    const __m256i b0 = _mm256_loadu_si256((const __m256i*)&b[i]);
+    _mm256_storeu_si256((__m256i*)&out[i], _mm256_add_epi32(a0, b0));
+  } else {
+    for (; size--; ++i) {
+      out[i] = a[i] + b[i];
+    }
+  }
+}
+
+static void AddVectorEq_AVX2(const uint32_t* WEBP_RESTRICT a,
+                             uint32_t* WEBP_RESTRICT out, int size) {
+  int i = 0;
+  int aligned_size = size & ~31;
+  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
+  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
+  // 2). See the usage in VP8LHistogramAdd().
+  assert(size >= 32);
+  assert(size % 2 == 0);
+
+  do {
+    const __m256i a0 = _mm256_loadu_si256((const __m256i*)&a[i + 0]);
+    const __m256i a1 = _mm256_loadu_si256((const __m256i*)&a[i + 8]);
+    const __m256i a2 = _mm256_loadu_si256((const __m256i*)&a[i + 16]);
+    const __m256i a3 = _mm256_loadu_si256((const __m256i*)&a[i + 24]);
+    const __m256i b0 = _mm256_loadu_si256((const __m256i*)&out[i + 0]);
+    const __m256i b1 = _mm256_loadu_si256((const __m256i*)&out[i + 8]);
+    const __m256i b2 = _mm256_loadu_si256((const __m256i*)&out[i + 16]);
+    const __m256i b3 = _mm256_loadu_si256((const __m256i*)&out[i + 24]);
+    _mm256_storeu_si256((__m256i*)&out[i + 0], _mm256_add_epi32(a0, b0));
+    _mm256_storeu_si256((__m256i*)&out[i + 8], _mm256_add_epi32(a1, b1));
+    _mm256_storeu_si256((__m256i*)&out[i + 16], _mm256_add_epi32(a2, b2));
+    _mm256_storeu_si256((__m256i*)&out[i + 24], _mm256_add_epi32(a3, b3));
+    i += 32;
+  } while (i != aligned_size);
+
+  if ((size & 16) != 0) {
+    const __m256i a0 = _mm256_loadu_si256((const __m256i*)&a[i + 0]);
+    const __m256i a1 = _mm256_loadu_si256((const __m256i*)&a[i + 8]);
+    const __m256i b0 = _mm256_loadu_si256((const __m256i*)&out[i + 0]);
+    const __m256i b1 = _mm256_loadu_si256((const __m256i*)&out[i + 8]);
+    _mm256_storeu_si256((__m256i*)&out[i + 0], _mm256_add_epi32(a0, b0));
+    _mm256_storeu_si256((__m256i*)&out[i + 8], _mm256_add_epi32(a1, b1));
+    i += 16;
+  }
+
+  size &= 15;
+  if (size == 8) {
+    const __m256i a0 = _mm256_loadu_si256((const __m256i*)&a[i]);
+    const __m256i b0 = _mm256_loadu_si256((const __m256i*)&out[i]);
+    _mm256_storeu_si256((__m256i*)&out[i], _mm256_add_epi32(a0, b0));
+  } else {
+    for (; size--; ++i) {
+      out[i] += a[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entropy
+
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+
+static uint64_t CombinedShannonEntropy_AVX2(const uint32_t X[256],
+                                            const uint32_t Y[256]) {
+  int i;
+  uint64_t retval = 0;
+  uint32_t sumX = 0, sumXY = 0;
+  const __m256i zero = _mm256_setzero_si256();
+
+  for (i = 0; i < 256; i += 32) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i*)(X + i + 0));
+    const __m256i y0 = _mm256_loadu_si256((const __m256i*)(Y + i + 0));
+    const __m256i x1 = _mm256_loadu_si256((const __m256i*)(X + i + 8));
+    const __m256i y1 = _mm256_loadu_si256((const __m256i*)(Y + i + 8));
+    const __m256i x2 = _mm256_loadu_si256((const __m256i*)(X + i + 16));
+    const __m256i y2 = _mm256_loadu_si256((const __m256i*)(Y + i + 16));
+    const __m256i x3 = _mm256_loadu_si256((const __m256i*)(X + i + 24));
+    const __m256i y3 = _mm256_loadu_si256((const __m256i*)(Y + i + 24));
+    const __m256i x4 = _mm256_packs_epi16(_mm256_packs_epi32(x0, x1),
+                                          _mm256_packs_epi32(x2, x3));
+    const __m256i y4 = _mm256_packs_epi16(_mm256_packs_epi32(y0, y1),
+                                          _mm256_packs_epi32(y2, y3));
+    // Packed pixels are actually in order: ... 17 16 12 11 10 9 8 3 2 1 0
+    const __m256i x5 = _mm256_permutevar8x32_epi32(
+        x4, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+    const __m256i y5 = _mm256_permutevar8x32_epi32(
+        y4, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+    const uint32_t mx =
+        (uint32_t)_mm256_movemask_epi8(_mm256_cmpgt_epi8(x5, zero));
+    uint32_t my =
+        (uint32_t)_mm256_movemask_epi8(_mm256_cmpgt_epi8(y5, zero)) | mx;
+    while (my) {
+      const int32_t j = BitsCtz(my);
+      uint32_t xy;
+      if ((mx >> j) & 1) {
+        const int x = X[i + j];
+        sumXY += x;
+        retval += VP8LFastSLog2(x);
+      }
+      xy = X[i + j] + Y[i + j];
+      sumX += xy;
+      retval += VP8LFastSLog2(xy);
+      my &= my - 1;
+    }
+  }
+  retval = VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) - retval;
+  return retval;
+}
+
+#else
+
+#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC   // won't be faster
+
+#endif
+
+//------------------------------------------------------------------------------
+
+static int VectorMismatch_AVX2(const uint32_t* const array1,
+                               const uint32_t* const array2, int length) {
+  int match_len;
+
+  if (length >= 24) {
+    __m256i A0 = _mm256_loadu_si256((const __m256i*)&array1[0]);
+    __m256i A1 = _mm256_loadu_si256((const __m256i*)&array2[0]);
+    match_len = 0;
+    do {
+      // Loop unrolling and early load both provide a speedup of 10% for the
+      // current function. Also, max_limit can be MAX_LENGTH=4096 at most.
+      const __m256i cmpA = _mm256_cmpeq_epi32(A0, A1);
+      const __m256i B0 =
+          _mm256_loadu_si256((const __m256i*)&array1[match_len + 8]);
+      const __m256i B1 =
+          _mm256_loadu_si256((const __m256i*)&array2[match_len + 8]);
+      if ((uint32_t)_mm256_movemask_epi8(cmpA) != 0xffffffff) break;
+      match_len += 8;
+
+      {
+        const __m256i cmpB = _mm256_cmpeq_epi32(B0, B1);
+        A0 = _mm256_loadu_si256((const __m256i*)&array1[match_len + 8]);
+        A1 = _mm256_loadu_si256((const __m256i*)&array2[match_len + 8]);
+        if ((uint32_t)_mm256_movemask_epi8(cmpB) != 0xffffffff) break;
+        match_len += 8;
+      }
+    } while (match_len + 24 < length);
+  } else {
+    match_len = 0;
+    // Unroll the potential first two loops.
+    if (length >= 8 &&
+        (uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi32(
+            _mm256_loadu_si256((const __m256i*)&array1[0]),
+            _mm256_loadu_si256((const __m256i*)&array2[0]))) == 0xffffffff) {
+      match_len = 8;
+      if (length >= 16 &&
+          (uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi32(
+              _mm256_loadu_si256((const __m256i*)&array1[8]),
+              _mm256_loadu_si256((const __m256i*)&array2[8]))) == 0xffffffff) {
+        match_len = 16;
+      }
+    }
+  }
+
+  while (match_len < length && array1[match_len] == array2[match_len]) {
+    ++match_len;
+  }
+  return match_len;
+}
+
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
+static void BundleColorMap_AVX2(const uint8_t* WEBP_RESTRICT const row,
+                                int width, int xbits,
+                                uint32_t* WEBP_RESTRICT dst) {
+  int x = 0;
+  assert(xbits >= 0);
+  assert(xbits <= 3);
+  switch (xbits) {
+    case 0: {
+      const __m256i ff = _mm256_set1_epi16((short)0xff00);
+      const __m256i zero = _mm256_setzero_si256();
+      // Store 0xff000000 | (row[x] << 8).
+      for (x = 0; x + 32 <= width; x += 32, dst += 32) {
+        const __m256i in = _mm256_loadu_si256((const __m256i*)&row[x]);
+        const __m256i in_lo = _mm256_unpacklo_epi8(zero, in);
+        const __m256i dst0 = _mm256_unpacklo_epi16(in_lo, ff);
+        const __m256i dst1 = _mm256_unpackhi_epi16(in_lo, ff);
+        const __m256i in_hi = _mm256_unpackhi_epi8(zero, in);
+        const __m256i dst2 = _mm256_unpacklo_epi16(in_hi, ff);
+        const __m256i dst3 = _mm256_unpackhi_epi16(in_hi, ff);
+        _mm256_storeu2_m128i((__m128i*)&dst[16], (__m128i*)&dst[0], dst0);
+        _mm256_storeu2_m128i((__m128i*)&dst[20], (__m128i*)&dst[4], dst1);
+        _mm256_storeu2_m128i((__m128i*)&dst[24], (__m128i*)&dst[8], dst2);
+        _mm256_storeu2_m128i((__m128i*)&dst[28], (__m128i*)&dst[12], dst3);
+      }
+      break;
+    }
+    case 1: {
+      const __m256i ff = _mm256_set1_epi16((short)0xff00);
+      const __m256i mul = _mm256_set1_epi16(0x110);
+      for (x = 0; x + 32 <= width; x += 32, dst += 16) {
+        // 0a0b | (where a/b are 4 bits).
+        const __m256i in = _mm256_loadu_si256((const __m256i*)&row[x]);
+        const __m256i tmp = _mm256_mullo_epi16(in, mul);  // aba0
+        const __m256i pack = _mm256_and_si256(tmp, ff);   // ab00
+        const __m256i dst0 = _mm256_unpacklo_epi16(pack, ff);
+        const __m256i dst1 = _mm256_unpackhi_epi16(pack, ff);
+        _mm256_storeu2_m128i((__m128i*)&dst[8], (__m128i*)&dst[0], dst0);
+        _mm256_storeu2_m128i((__m128i*)&dst[12], (__m128i*)&dst[4], dst1);
+      }
+      break;
+    }
+    case 2: {
+      const __m256i mask_or = _mm256_set1_epi32((int)0xff000000);
+      const __m256i mul_cst = _mm256_set1_epi16(0x0104);
+      const __m256i mask_mul = _mm256_set1_epi16(0x0f00);
+      for (x = 0; x + 32 <= width; x += 32, dst += 8) {
+        // 000a000b000c000d | (where a/b/c/d are 2 bits).
+        const __m256i in = _mm256_loadu_si256((const __m256i*)&row[x]);
+        const __m256i mul =
+            _mm256_mullo_epi16(in, mul_cst);  // 00ab00b000cd00d0
+        const __m256i tmp =
+            _mm256_and_si256(mul, mask_mul);               //  00ab000000cd0000
+        const __m256i shift = _mm256_srli_epi32(tmp, 12);  // 00000000ab000000
+        const __m256i pack = _mm256_or_si256(shift, tmp);  // 00000000abcd0000
+        // Convert to 0xff00**00.
+        const __m256i res = _mm256_or_si256(pack, mask_or);
+        _mm256_storeu_si256((__m256i*)dst, res);
+      }
+      break;
+    }
+    default: {
+      assert(xbits == 3);
+      for (x = 0; x + 32 <= width; x += 32, dst += 4) {
+        // 0000000a00000000b... | (where a/b are 1 bit).
+        const __m256i in = _mm256_loadu_si256((const __m256i*)&row[x]);
+        const __m256i shift = _mm256_slli_epi64(in, 7);
+        const uint32_t move = _mm256_movemask_epi8(shift);
+        dst[0] = 0xff000000 | ((move & 0xff) << 8);
+        dst[1] = 0xff000000 | (move & 0xff00);
+        dst[2] = 0xff000000 | ((move & 0xff0000) >> 8);
+        dst[3] = 0xff000000 | ((move & 0xff000000) >> 16);
+      }
+      break;
+    }
+  }
+  if (x != width) {
+    VP8LBundleColorMap_SSE(row + x, width - x, xbits, dst);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Batch version of Predictor Transform subtraction
+
+static WEBP_INLINE void Average2_m256i(const __m256i* const a0,
+                                       const __m256i* const a1,
+                                       __m256i* const avg) {
+  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+  const __m256i ones = _mm256_set1_epi8(1);
+  const __m256i avg1 = _mm256_avg_epu8(*a0, *a1);
+  const __m256i one = _mm256_and_si256(_mm256_xor_si256(*a0, *a1), ones);
+  *avg = _mm256_sub_epi8(avg1, one);
+}
+
+// Predictor0: ARGB_BLACK.
+static void PredictorSub0_AVX2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i;
+  const __m256i black = _mm256_set1_epi32((int)ARGB_BLACK);
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    const __m256i res = _mm256_sub_epi8(src, black);
+    _mm256_storeu_si256((__m256i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_SSE[0](in + i, NULL, num_pixels - i, out + i);
+  }
+  (void)upper;
+}
+
+#define GENERATE_PREDICTOR_1(X, IN)                                          \
+  static void PredictorSub##X##_AVX2(                                        \
+      const uint32_t* const in, const uint32_t* const upper, int num_pixels, \
+      uint32_t* WEBP_RESTRICT const out) {                                   \
+    int i;                                                                   \
+    for (i = 0; i + 8 <= num_pixels; i += 8) {                               \
+      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);        \
+      const __m256i pred = _mm256_loadu_si256((const __m256i*)&(IN));        \
+      const __m256i res = _mm256_sub_epi8(src, pred);                        \
+      _mm256_storeu_si256((__m256i*)&out[i], res);                           \
+    }                                                                        \
+    if (i != num_pixels) {                                                   \
+      VP8LPredictorsSub_SSE[(X)](in + i, WEBP_OFFSET_PTR(upper, i),          \
+                                 num_pixels - i, out + i);                   \
+    }                                                                        \
+  }
+
+GENERATE_PREDICTOR_1(1, in[i - 1])       // Predictor1: L
+GENERATE_PREDICTOR_1(2, upper[i])        // Predictor2: T
+GENERATE_PREDICTOR_1(3, upper[i + 1])    // Predictor3: TR
+GENERATE_PREDICTOR_1(4, upper[i - 1])    // Predictor4: TL
+#undef GENERATE_PREDICTOR_1
+
+// Predictor5: avg2(avg2(L, TR), T)
+static void PredictorSub5_AVX2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i;
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i L = _mm256_loadu_si256((const __m256i*)&in[i - 1]);
+    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
+    const __m256i TR = _mm256_loadu_si256((const __m256i*)&upper[i + 1]);
+    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    __m256i avg, pred, res;
+    Average2_m256i(&L, &TR, &avg);
+    Average2_m256i(&avg, &T, &pred);
+    res = _mm256_sub_epi8(src, pred);
+    _mm256_storeu_si256((__m256i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_SSE[5](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+#define GENERATE_PREDICTOR_2(X, A, B)                                         \
+  static void PredictorSub##X##_AVX2(const uint32_t* in,                      \
+                                     const uint32_t* upper, int num_pixels,   \
+                                     uint32_t* WEBP_RESTRICT out) {           \
+    int i;                                                                    \
+    for (i = 0; i + 8 <= num_pixels; i += 8) {                                \
+      const __m256i tA = _mm256_loadu_si256((const __m256i*)&(A));            \
+      const __m256i tB = _mm256_loadu_si256((const __m256i*)&(B));            \
+      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);         \
+      __m256i pred, res;                                                      \
+      Average2_m256i(&tA, &tB, &pred);                                        \
+      res = _mm256_sub_epi8(src, pred);                                       \
+      _mm256_storeu_si256((__m256i*)&out[i], res);                            \
+    }                                                                         \
+    if (i != num_pixels) {                                                    \
+      VP8LPredictorsSub_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
+    }                                                                         \
+  }
+
+GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1])   // Predictor6: avg(L, TL)
+GENERATE_PREDICTOR_2(7, in[i - 1], upper[i])       // Predictor7: avg(L, T)
+GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i])    // Predictor8: avg(TL, T)
+GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1])    // Predictor9: average(T, TR)
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: avg(avg(L,TL), avg(T, TR)).
+static void PredictorSub10_AVX2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i;
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i L = _mm256_loadu_si256((const __m256i*)&in[i - 1]);
+    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    const __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
+    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
+    const __m256i TR = _mm256_loadu_si256((const __m256i*)&upper[i + 1]);
+    __m256i avgTTR, avgLTL, avg, res;
+    Average2_m256i(&T, &TR, &avgTTR);
+    Average2_m256i(&L, &TL, &avgLTL);
+    Average2_m256i(&avgTTR, &avgLTL, &avg);
+    res = _mm256_sub_epi8(src, avg);
+    _mm256_storeu_si256((__m256i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_SSE[10](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictor11: select.
+static void GetSumAbsDiff32_AVX2(const __m256i* const A, const __m256i* const B,
+                                 __m256i* const out) {
+  // We can unpack with any value on the upper 32 bits, provided it's the same
+  // on both operands (to that their sum of abs diff is zero). Here we use *A.
+  const __m256i A_lo = _mm256_unpacklo_epi32(*A, *A);
+  const __m256i B_lo = _mm256_unpacklo_epi32(*B, *A);
+  const __m256i A_hi = _mm256_unpackhi_epi32(*A, *A);
+  const __m256i B_hi = _mm256_unpackhi_epi32(*B, *A);
+  const __m256i s_lo = _mm256_sad_epu8(A_lo, B_lo);
+  const __m256i s_hi = _mm256_sad_epu8(A_hi, B_hi);
+  *out = _mm256_packs_epi32(s_lo, s_hi);
+}
+
+static void PredictorSub11_AVX2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i;
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i L = _mm256_loadu_si256((const __m256i*)&in[i - 1]);
+    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
+    const __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
+    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    __m256i pa, pb;
+    GetSumAbsDiff32_AVX2(&T, &TL, &pa);  // pa = sum |T-TL|
+    GetSumAbsDiff32_AVX2(&L, &TL, &pb);  // pb = sum |L-TL|
+    {
+      const __m256i mask = _mm256_cmpgt_epi32(pb, pa);
+      const __m256i A = _mm256_and_si256(mask, L);
+      const __m256i B = _mm256_andnot_si256(mask, T);
+      const __m256i pred = _mm256_or_si256(A, B);  // pred = (L > T)? L : T
+      const __m256i res = _mm256_sub_epi8(src, pred);
+      _mm256_storeu_si256((__m256i*)&out[i], res);
+    }
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_SSE[11](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictor12: ClampedSubSubtractFull.
+static void PredictorSub12_AVX2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i;
+  const __m256i zero = _mm256_setzero_si256();
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    const __m256i L = _mm256_loadu_si256((const __m256i*)&in[i - 1]);
+    const __m256i L_lo = _mm256_unpacklo_epi8(L, zero);
+    const __m256i L_hi = _mm256_unpackhi_epi8(L, zero);
+    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
+    const __m256i T_lo = _mm256_unpacklo_epi8(T, zero);
+    const __m256i T_hi = _mm256_unpackhi_epi8(T, zero);
+    const __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
+    const __m256i TL_lo = _mm256_unpacklo_epi8(TL, zero);
+    const __m256i TL_hi = _mm256_unpackhi_epi8(TL, zero);
+    const __m256i diff_lo = _mm256_sub_epi16(T_lo, TL_lo);
+    const __m256i diff_hi = _mm256_sub_epi16(T_hi, TL_hi);
+    const __m256i pred_lo = _mm256_add_epi16(L_lo, diff_lo);
+    const __m256i pred_hi = _mm256_add_epi16(L_hi, diff_hi);
+    const __m256i pred = _mm256_packus_epi16(pred_lo, pred_hi);
+    const __m256i res = _mm256_sub_epi8(src, pred);
+    _mm256_storeu_si256((__m256i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_SSE[12](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictors13: ClampedAddSubtractHalf
+static void PredictorSub13_AVX2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
+  int i;
+  const __m256i zero = _mm256_setzero_si256();
+  for (i = 0; i + 8 <= num_pixels; i += 8) {
+    const __m256i L = _mm256_loadu_si256((const __m256i*)&in[i - 1]);
+    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
+    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
+    const __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
+    // lo.
+    const __m256i L_lo = _mm256_unpacklo_epi8(L, zero);
+    const __m256i T_lo = _mm256_unpacklo_epi8(T, zero);
+    const __m256i TL_lo = _mm256_unpacklo_epi8(TL, zero);
+    const __m256i sum_lo = _mm256_add_epi16(T_lo, L_lo);
+    const __m256i avg_lo = _mm256_srli_epi16(sum_lo, 1);
+    const __m256i A1_lo = _mm256_sub_epi16(avg_lo, TL_lo);
+    const __m256i bit_fix_lo = _mm256_cmpgt_epi16(TL_lo, avg_lo);
+    const __m256i A2_lo = _mm256_sub_epi16(A1_lo, bit_fix_lo);
+    const __m256i A3_lo = _mm256_srai_epi16(A2_lo, 1);
+    const __m256i A4_lo = _mm256_add_epi16(avg_lo, A3_lo);
+    // hi.
+    const __m256i L_hi = _mm256_unpackhi_epi8(L, zero);
+    const __m256i T_hi = _mm256_unpackhi_epi8(T, zero);
+    const __m256i TL_hi = _mm256_unpackhi_epi8(TL, zero);
+    const __m256i sum_hi = _mm256_add_epi16(T_hi, L_hi);
+    const __m256i avg_hi = _mm256_srli_epi16(sum_hi, 1);
+    const __m256i A1_hi = _mm256_sub_epi16(avg_hi, TL_hi);
+    const __m256i bit_fix_hi = _mm256_cmpgt_epi16(TL_hi, avg_hi);
+    const __m256i A2_hi = _mm256_sub_epi16(A1_hi, bit_fix_hi);
+    const __m256i A3_hi = _mm256_srai_epi16(A2_hi, 1);
+    const __m256i A4_hi = _mm256_add_epi16(avg_hi, A3_hi);
+
+    const __m256i pred = _mm256_packus_epi16(A4_lo, A4_hi);
+    const __m256i res = _mm256_sub_epi8(src, pred);
+    _mm256_storeu_si256((__m256i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_SSE[13](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitAVX2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitAVX2(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_AVX2;
+  VP8LTransformColor = TransformColor_AVX2;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_AVX2;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_AVX2;
+  VP8LAddVector = AddVector_AVX2;
+  VP8LAddVectorEq = AddVectorEq_AVX2;
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy_AVX2;
+  VP8LVectorMismatch = VectorMismatch_AVX2;
+  VP8LBundleColorMap = BundleColorMap_AVX2;
+
+  VP8LPredictorsSub[0] = PredictorSub0_AVX2;
+  VP8LPredictorsSub[1] = PredictorSub1_AVX2;
+  VP8LPredictorsSub[2] = PredictorSub2_AVX2;
+  VP8LPredictorsSub[3] = PredictorSub3_AVX2;
+  VP8LPredictorsSub[4] = PredictorSub4_AVX2;
+  VP8LPredictorsSub[5] = PredictorSub5_AVX2;
+  VP8LPredictorsSub[6] = PredictorSub6_AVX2;
+  VP8LPredictorsSub[7] = PredictorSub7_AVX2;
+  VP8LPredictorsSub[8] = PredictorSub8_AVX2;
+  VP8LPredictorsSub[9] = PredictorSub9_AVX2;
+  VP8LPredictorsSub[10] = PredictorSub10_AVX2;
+  VP8LPredictorsSub[11] = PredictorSub11_AVX2;
+  VP8LPredictorsSub[12] = PredictorSub12_AVX2;
+  VP8LPredictorsSub[13] = PredictorSub13_AVX2;
+  VP8LPredictorsSub[14] = PredictorSub0_AVX2;  // <- padding security sentinels
+  VP8LPredictorsSub[15] = PredictorSub0_AVX2;
+}
+
+#else  // !WEBP_USE_AVX2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitAVX2)
+
+#endif  // WEBP_USE_AVX2
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@ -17,6 +17,7 @@

 #include <assert.h>
 #include <emmintrin.h>
+#include <string.h>

 #include "src/dsp/cpu.h"
 #include "src/dsp/lossless.h"
@ -726,6 +727,15 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
  VP8LPredictorsSub[13] = PredictorSub13_SSE2;
  VP8LPredictorsSub[14] = PredictorSub0_SSE2;  // <- padding security sentinels
  VP8LPredictorsSub[15] = PredictorSub0_SSE2;
+
+  // SSE exports for AVX and above.
+  VP8LSubtractGreenFromBlueAndRed_SSE = SubtractGreenFromBlueAndRed_SSE2;
+  VP8LTransformColor_SSE = TransformColor_SSE2;
+  VP8LCollectColorBlueTransforms_SSE = CollectColorBlueTransforms_SSE2;
+  VP8LCollectColorRedTransforms_SSE = CollectColorRedTransforms_SSE2;
+  VP8LBundleColorMap_SSE = BundleColorMap_SSE2;
+
+  memcpy(VP8LPredictorsSub_SSE, VP8LPredictorsSub, sizeof(VP8LPredictorsSub));
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/lossless_enc_sse41.c
+++ b/src/dsp/lossless_enc_sse41.c
@ -203,6 +203,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
  VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
+
+  // SSE exports for AVX and above.
+  VP8LSubtractGreenFromBlueAndRed_SSE = SubtractGreenFromBlueAndRed_SSE41;
+  VP8LCollectColorBlueTransforms_SSE = CollectColorBlueTransforms_SSE41;
+  VP8LCollectColorRedTransforms_SSE = CollectColorRedTransforms_SSE41;
 }

 #else  // !WEBP_USE_SSE41
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@ -16,6 +16,7 @@
 #if defined(WEBP_USE_SSE2)

 #include <emmintrin.h>
+#include <string.h>

 #include "src/dsp/common_sse2.h"
 #include "src/dsp/cpu.h"
@ -710,6 +711,15 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
+
+  // SSE exports for AVX and above.
+  memcpy(VP8LPredictorsAdd_SSE, VP8LPredictorsAdd, sizeof(VP8LPredictorsAdd));
+
+  VP8LAddGreenToBlueAndRed_SSE = AddGreenToBlueAndRed_SSE2;
+  VP8LTransformColorInverse_SSE = TransformColorInverse_SSE2;
+
+  VP8LConvertBGRAToRGB_SSE = ConvertBGRAToRGB_SSE2;
+  VP8LConvertBGRAToRGBA_SSE = ConvertBGRAToRGBA_SSE2;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/dsp/lossless_sse41.c
+++ b/src/dsp/lossless_sse41.c
@ -125,6 +125,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
  VP8LTransformColorInverse = TransformColorInverse_SSE41;
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
+
+  // SSE exports for AVX and above.
+  VP8LTransformColorInverse_SSE = TransformColorInverse_SSE41;
+  VP8LConvertBGRAToRGB_SSE = ConvertBGRAToRGB_SSE41;
 }

 #else  // !WEBP_USE_SSE41
--- a/src/enc/histogram_enc.c
+++ b/src/enc/histogram_enc.c
@ -401,10 +401,8 @@ WEBP_NODISCARD static int GetCombinedHistogramEntropy(
  *cost = GetCombinedEntropy(a->literal_, b->literal_,
                             VP8LHistogramNumCodes(palette_code_bits),
                             a->is_used_[0], b->is_used_[0], 0);
-  *cost += (uint64_t)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
-                                           b->literal_ + NUM_LITERAL_CODES,
-                                           NUM_LENGTH_CODES)
-           << LOG_2_PRECISION_BITS;
+  // No need to add the extra cost as it is a constant that does not influence
+  // the histograms.
  if (*cost >= cost_threshold) return 0;

  if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
@ -434,9 +432,8 @@ WEBP_NODISCARD static int GetCombinedHistogramEntropy(

  *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
                              a->is_used_[4], b->is_used_[4], 0);
-  *cost += (uint64_t)VP8LExtraCostCombined(a->distance_, b->distance_,
-                                           NUM_DISTANCE_CODES)
-           << LOG_2_PRECISION_BITS;
+  // No need to add the extra cost as it is a constant that does not influence
+  // the histograms.
  if (*cost >= cost_threshold) return 0;

  return 1;
@ -528,16 +525,13 @@ static void UpdateHistogramCost(VP8LHistogram* const h) {
  uint32_t alpha_sym, red_sym, blue_sym;
  const uint64_t alpha_cost =
      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
+  // No need to add the extra cost as it is a constant that does not influence
+  // the histograms.
  const uint64_t distance_cost =
-      PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
-      ((uint64_t)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES)
-       << LOG_2_PRECISION_BITS);
+      PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]);
  const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
  h->literal_cost_ =
-      PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
-      ((uint64_t)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES,
-                               NUM_LENGTH_CODES)
-       << LOG_2_PRECISION_BITS);
+      PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]);
  h->red_cost_ =
      PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
  h->blue_cost_ =