Basic version of libsharpyuv in libwebp, in C.

It's self contained apart from a dependency on src/webp/types.h and src/dsp/cpu.h For now it's only set up as an internal library, not an installable one. Webp doesn't depend on it yet, the code is only duplicated. Change-Id: I752799894f9d4105d0d296ddebd9f9641181a1ec
2026-01-30 06:59:24 +01:00 · 2022-02-18 10:19:18 +00:00
parent a30f219016
commit 29cc95ce4c
15 changed files with 1210 additions and 221 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,7 @@ option(WEBP_BUILD_VWEBP "Build the vwebp viewer tool." ON)
 option(WEBP_BUILD_WEBPINFO "Build the webpinfo command line tool." ON)
 option(WEBP_BUILD_LIBWEBPMUX "Build the libwebpmux library." ON)
 option(WEBP_BUILD_WEBPMUX "Build the webpmux command line tool." ON)
+option(WEBP_BUILD_SHARPYUV "Build the sharpyuv library." OFF)
 option(WEBP_BUILD_EXTRAS "Build extras." ON)
 option(WEBP_BUILD_WEBP_JS "Emscripten build of webp.js." OFF)
 option(WEBP_USE_THREAD "Enable threading support" ON)
@@ -211,6 +212,19 @@ function(libwebp_add_stub_file TARGET)
  target_sources(${TARGET} PRIVATE ${stub_source_file})
 endfunction()

+if(WEBP_BUILD_SHARPYUV)
+  parse_makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/sharpyuv "WEBP_SHARPYUV_SRCS"
+                    "")
+  add_library(sharpyuv OBJECT ${WEBP_SHARPYUV_SRCS})
+  target_include_directories(sharpyuv
+                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+                             ${CMAKE_CURRENT_SOURCE_DIR})
+  set_target_properties(
+    sharpyuv
+    PROPERTIES PUBLIC_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/sharpyuv/sharpyuv.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h")
+endif()
+
 if(MSVC)
  # avoid security warnings for e.g., fopen() used in the examples.
  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
--- a/Makefile.am
+++ b/Makefile.am
@@ -2,6 +2,10 @@ ACLOCAL_AMFLAGS = -I m4
 SUBDIRS = src imageio man
 EXTRA_DIST = COPYING autogen.sh

+if BUILD_SHARPYUV
+  SUBDIRS += sharpyuv
+endif
+
 if BUILD_EXTRAS
  SUBDIRS += extras
 endif
--- a/configure.ac
+++ b/configure.ac
@@ -27,7 +27,8 @@ AC_ARG_ENABLE([everything],
              AS_HELP_STRING([--enable-everything],
                             [Enable all optional targets. These can still be
                              disabled with --disable-target]),
-              [SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
+              [SET_IF_UNSET([enable_libsharpyuv], [$enableval])
+               SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
               SET_IF_UNSET([enable_libwebpdemux], [$enableval])
               SET_IF_UNSET([enable_libwebpextras], [$enableval])
               SET_IF_UNSET([enable_libwebpmux], [$enableval])])
@@ -66,6 +67,14 @@ AC_ARG_ENABLE([libwebpextras],
 AC_MSG_RESULT(${enable_libwebpextras-no})
 AM_CONDITIONAL([BUILD_EXTRAS], [test "$enable_libwebpextras" = "yes"])

+dnl === Check whether libsharpyuv should be built
+AC_MSG_CHECKING(whether libsharpyuv is to be built)
+AC_ARG_ENABLE([libsharpyuv],
+              AS_HELP_STRING([--enable-libsharpyuv],
+                             [Build libsharpyuv @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libsharpyuv-no})
+AM_CONDITIONAL([BUILD_SHARPYUV], [test "$enable_libsharpyuv" = "yes"])
+
 dnl === If --enable-asserts is not defined, define NDEBUG

 AC_MSG_CHECKING(whether asserts are enabled)
@@ -751,6 +760,7 @@ AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_HEADERS([src/webp/config.h])
 AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
                 examples/Makefile extras/Makefile imageio/Makefile \
+                 sharpyuv/Makefile \
                 src/dec/Makefile src/enc/Makefile src/dsp/Makefile \
                 src/demux/Makefile src/mux/Makefile \
                 src/utils/Makefile \
@@ -772,6 +782,7 @@ libwebpdecoder: ${enable_libwebpdecoder-no}
 libwebpdemux: ${enable_libwebpdemux-no}
 libwebpmux: ${enable_libwebpmux-no}
 libwebpextras: ${enable_libwebpextras-no}
+libsharpyuv: ${enable_libsharpyuv-no}

 Tools:
 cwebp : ${enable_libwebpdemux-no}
--- a/makefile.unix
+++ b/makefile.unix
@@ -304,6 +304,7 @@ HDRS = \
    src/dec/vp8li_dec.h \
    src/dec/webpi_dec.h \
    src/dsp/common_sse2.h \
+    src/dsp/cpu.h \
    src/dsp/dsp.h \
    src/dsp/lossless.h \
    src/dsp/lossless_common.h \
--- a/sharpyuv/Makefile.am
+++ b/sharpyuv/Makefile.am
@@ -0,0 +1,32 @@
+AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
+AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src
+noinst_LTLIBRARIES =
+noinst_LTLIBRARIES += libsharpyuv.la
+noinst_LTLIBRARIES += libsharpyuv_sse2.la
+noinst_LTLIBRARIES += libsharpyuv_neon.la
+
+noinst_HEADERS =
+noinst_HEADERS += ../src/webp/types.h
+noinst_HEADERS += ../src/dsp/cpu.h
+
+libsharpyuv_sse2_la_SOURCES =
+libsharpyuv_sse2_la_SOURCES += sharpyuv_sse2.c
+libsharpyuv_sse2_la_CPPFLAGS = $(libsharpyuv_la_CPPFLAGS)
+libsharpyuv_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
+
+libsharpyuv_neon_la_SOURCES =
+libsharpyuv_neon_la_SOURCES += sharpyuv_neon.c
+libsharpyuv_neon_la_CPPFLAGS = $(libsharpyuv_la_CPPFLAGS)
+libsharpyuv_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_FLAGS)
+
+libsharpyuv_la_SOURCES =
+libsharpyuv_la_SOURCES += sharpyuv_dsp.c sharpyuv_dsp.h
+libsharpyuv_la_SOURCES += sharpyuv.c sharpyuv.h
+
+libsharpyuv_la_CPPFLAGS = $(AM_CPPFLAGS)
+libsharpyuv_la_LDFLAGS =
+libsharpyuv_la_LIBADD =
+libsharpyuv_la_LIBADD += libsharpyuv_sse2.la
+libsharpyuv_la_LIBADD += libsharpyuv_neon.la
+
+noinst_PROGRAMS =
--- a/sharpyuv/sharpyuv.c
+++ b/sharpyuv/sharpyuv.c
@@ -0,0 +1,460 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./sharpyuv.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#include "src/webp/types.h"
+#include "src/dsp/cpu.h"
+#include "sharpyuv/sharpyuv_dsp.h"
+
+//------------------------------------------------------------------------------
+// Code for gamma correction
+
+// gamma-compensates loss of resolution during chroma subsampling
+#define kGamma 0.80      // for now we use a different gamma value than kGammaF
+#define kGammaFix 12     // fixed-point precision for linear values
+#define kGammaScale ((1 << kGammaFix) - 1)
+#define kGammaTabFix 7   // fixed-point fractional bits precision
+#define kGammaTabScale (1 << kGammaTabFix)
+#define kGammaTabRounder (kGammaTabScale >> 1)
+#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+
+enum {
+  YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
+  YUV_HALF = 1 << (YUV_FIX - 1),
+};
+
+//------------------------------------------------------------------------------
+// Sharp RGB->YUV conversion
+
+static const int kNumIterations = 4;
+static const int kMinDimensionIterativeConversion = 4;
+
+// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
+// banding sometimes. Better use extra precision.
+#define SFIX 2                // fixed-point precision of RGB and Y/W
+typedef int16_t fixed_t;      // signed type with extra SFIX precision for UV
+typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
+
+#define SHALF (1 << SFIX >> 1)
+#define MAX_Y_T ((256 << SFIX) - 1)
+#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
+
+// We use tables of different size and precision for the Rec709 / BT2020
+// transfer function.
+#define kGammaF (1./0.45)
+static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
+#define GAMMA_TO_LINEAR_BITS 14
+static uint32_t kGammaToLinearTabS[MAX_Y_T + 1];   // size scales with Y_FIX
+static volatile int kGammaTablesSOk = 0;
+static void InitGammaTablesS(void);
+
+WEBP_DSP_INIT_FUNC(InitGammaTablesS) {
+  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
+  if (!kGammaTablesSOk) {
+    int v;
+    const double norm = 1. / MAX_Y_T;
+    const double scale = 1. / kGammaTabSize;
+    const double a = 0.09929682680944;
+    const double thresh = 0.018053968510807;
+    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
+    for (v = 0; v <= MAX_Y_T; ++v) {
+      const double g = norm * v;
+      double value;
+      if (g <= thresh * 4.5) {
+        value = g / 4.5;
+      } else {
+        const double a_rec = 1. / (1. + a);
+        value = pow(a_rec * (g + a), kGammaF);
+      }
+      kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
+    }
+    for (v = 0; v <= kGammaTabSize; ++v) {
+      const double g = scale * v;
+      double value;
+      if (g <= thresh) {
+        value = 4.5 * g;
+      } else {
+        value = (1. + a) * pow(g, 1. / kGammaF) - a;
+      }
+      // we already incorporate the 1/2 rounding constant here
+      kLinearToGammaTabS[v] =
+          (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
+    }
+    // to prevent small rounding errors to cause read-overflow:
+    kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
+    kGammaTablesSOk = 1;
+  }
+}
+
+// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+  return kGammaToLinearTabS[v];
+}
+
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+  // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
+  const uint32_t v = value * kGammaTabSize;
+  const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
+  // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
+  const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
+  // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
+  const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
+  const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
+  // Final interpolation. Note that rounding is already included.
+  const uint32_t v2 = (v1 - v0) * x;    // note: v1 >= v0.
+  const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
+  return result;
+}
+
+//------------------------------------------------------------------------------
+
+static uint8_t clip_8b(fixed_t v) {
+  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static fixed_y_t clip_y(int y) {
+  return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int r, int g, int b) {
+  const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
+  return (luma >> YUV_FIX);
+}
+
+static uint32_t ScaleDown(int a, int b, int c, int d) {
+  const uint32_t A = GammaToLinearS(a);
+  const uint32_t B = GammaToLinearS(b);
+  const uint32_t C = GammaToLinearS(c);
+  const uint32_t D = GammaToLinearS(d);
+  return LinearToGammaS((A + B + C + D + 2) >> 2);
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    const uint32_t R = GammaToLinearS(src[0 * w + i]);
+    const uint32_t G = GammaToLinearS(src[1 * w + i]);
+    const uint32_t B = GammaToLinearS(src[2 * w + i]);
+    const uint32_t Y = RGBToGray(R, G, B);
+    dst[i] = (fixed_y_t)LinearToGammaS(Y);
+  }
+}
+
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+                         fixed_t* dst, int uv_w) {
+  int i;
+  for (i = 0; i < uv_w; ++i) {
+    const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
+                            src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
+    const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
+                            src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
+    const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
+                            src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
+    const int W = RGBToGray(r, g, b);
+    dst[0 * uv_w] = (fixed_t)(r - W);
+    dst[1 * uv_w] = (fixed_t)(g - W);
+    dst[2 * uv_w] = (fixed_t)(b - W);
+    dst  += 1;
+    src1 += 2;
+    src2 += 2;
+  }
+}
+
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
+  const int v0 = (A * 3 + B + 2) >> 2;
+  return clip_y(v0 + W0);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {  // 8bit -> SFIX
+  return ((fixed_y_t)a << SFIX) | SHALF;
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+                         const uint8_t* const g_ptr,
+                         const uint8_t* const b_ptr,
+                         int step,
+                         int pic_width,
+                         fixed_y_t* const dst) {
+  int i;
+  const int w = (pic_width + 1) & ~1;
+  for (i = 0; i < pic_width; ++i) {
+    const int off = i * step;
+    dst[i + 0 * w] = UpLift(r_ptr[off]);
+    dst[i + 1 * w] = UpLift(g_ptr[off]);
+    dst[i + 2 * w] = UpLift(b_ptr[off]);
+  }
+  if (pic_width & 1) {  // replicate rightmost pixel
+    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
+  }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+                               const fixed_t* prev_uv,
+                               const fixed_t* cur_uv,
+                               const fixed_t* next_uv,
+                               int w,
+                               fixed_y_t* out1,
+                               fixed_y_t* out2) {
+  const int uv_w = w >> 1;
+  const int len = (w - 1) >> 1;   // length to filter
+  int k = 3;
+  while (k-- > 0) {   // process each R/G/B segments in turn
+    // special boundary case for i==0
+    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
+    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
+
+    SharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
+    SharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
+
+    // special boundary case for i == w - 1 when w is even
+    if (!(w & 1)) {
+      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+                            best_y[w - 1 + 0]);
+      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+                            best_y[w - 1 + w]);
+    }
+    out1 += w;
+    out2 += w;
+    prev_uv += uv_w;
+    cur_uv  += uv_w;
+    next_uv += uv_w;
+  }
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
+  const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
+  return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
+  const int u =  -9719 * r - 19081 * g + 28800 * b + SROUNDER;
+  return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
+  const int v = +28800 * r - 24116 * g - 4684 * b + SROUNDER;
+  return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
+                            uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u,
+                            int dst_stride_u, uint8_t* dst_v, int dst_stride_v,
+                            int width, int height) {
+  int i, j;
+  const fixed_t* const best_uv_base = best_uv;
+  const int w = (width + 1) & ~1;
+  const int h = (height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  for (best_uv = best_uv_base, j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const int off = (i >> 1);
+      const int W = best_y[i];
+      const int r = best_uv[off + 0 * uv_w] + W;
+      const int g = best_uv[off + 1 * uv_w] + W;
+      const int b = best_uv[off + 2 * uv_w] + W;
+      dst_y[i] = ConvertRGBToY(r, g, b);
+    }
+    best_y += w;
+    best_uv += (j & 1) * 3 * uv_w;
+    dst_y += dst_stride_y;
+  }
+  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
+    for (i = 0; i < uv_w; ++i) {
+      const int off = i;
+      const int r = best_uv[off + 0 * uv_w];
+      const int g = best_uv[off + 1 * uv_w];
+      const int b = best_uv[off + 2 * uv_w];
+      dst_u[i] = ConvertRGBToU(r, g, b);
+      dst_v[i] = ConvertRGBToV(r, g, b);
+    }
+    best_uv += 3 * uv_w;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+static void* SafeMalloc(uint64_t nmemb, size_t size) {
+  const uint64_t total_size = nmemb * (uint64_t)size;
+  if (total_size != (size_t)total_size) return NULL;
+  return malloc((size_t)total_size);
+}
+
+#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((W) * (H), sizeof(T)))
+
+static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
+                            const uint8_t* b_ptr, int step, int rgb_stride,
+                            uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u,
+                            int dst_stride_u, uint8_t* dst_v, int dst_stride_v,
+                            int width, int height) {
+  // we expand the right/bottom border if needed
+  const int w = (width + 1) & ~1;
+  const int h = (height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  uint64_t prev_diff_y_sum = ~0;
+  int j, iter;
+
+  // TODO(skal): allocate one big memory chunk. But for now, it's easier
+  // for valgrind debugging to have several chunks.
+  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
+  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+  fixed_y_t* best_y = best_y_base;
+  fixed_y_t* target_y = target_y_base;
+  fixed_t* best_uv = best_uv_base;
+  fixed_t* target_uv = target_uv_base;
+  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
+  int ok;
+
+  if (best_y_base == NULL || best_uv_base == NULL ||
+      target_y_base == NULL || target_uv_base == NULL ||
+      best_rgb_y == NULL || best_rgb_uv == NULL ||
+      tmp_buffer == NULL) {
+    ok = 0;
+    goto End;
+  }
+
+  InitGammaTablesS();
+  InitSharpYuv();
+
+  // Import RGB samples to W/RGB representation.
+  for (j = 0; j < height; j += 2) {
+    const int is_last_row = (j == height - 1);
+    fixed_y_t* const src1 = tmp_buffer + 0 * w;
+    fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+    // prepare two rows of input
+    ImportOneRow(r_ptr, g_ptr, b_ptr, step, width, src1);
+    if (!is_last_row) {
+      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
+                   step, width, src2);
+    } else {
+      memcpy(src2, src1, 3 * w * sizeof(*src2));
+    }
+    StoreGray(src1, best_y + 0, w);
+    StoreGray(src2, best_y + w, w);
+
+    UpdateW(src1, target_y, w);
+    UpdateW(src2, target_y + w, w);
+    UpdateChroma(src1, src2, target_uv, uv_w);
+    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+    best_y += 2 * w;
+    best_uv += 3 * uv_w;
+    target_y += 2 * w;
+    target_uv += 3 * uv_w;
+    r_ptr += 2 * rgb_stride;
+    g_ptr += 2 * rgb_stride;
+    b_ptr += 2 * rgb_stride;
+  }
+
+  // Iterate and resolve clipping conflicts.
+  for (iter = 0; iter < kNumIterations; ++iter) {
+    const fixed_t* cur_uv = best_uv_base;
+    const fixed_t* prev_uv = best_uv_base;
+    uint64_t diff_y_sum = 0;
+
+    best_y = best_y_base;
+    best_uv = best_uv_base;
+    target_y = target_y_base;
+    target_uv = target_uv_base;
+    for (j = 0; j < h; j += 2) {
+      fixed_y_t* const src1 = tmp_buffer + 0 * w;
+      fixed_y_t* const src2 = tmp_buffer + 3 * w;
+      {
+        const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
+        prev_uv = cur_uv;
+        cur_uv = next_uv;
+      }
+
+      UpdateW(src1, best_rgb_y + 0 * w, w);
+      UpdateW(src2, best_rgb_y + 1 * w, w);
+      UpdateChroma(src1, src2, best_rgb_uv, uv_w);
+
+      // update two rows of Y and one row of RGB
+      diff_y_sum += SharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
+      SharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+      best_y += 2 * w;
+      best_uv += 3 * uv_w;
+      target_y += 2 * w;
+      target_uv += 3 * uv_w;
+    }
+    // test exit condition
+    if (iter > 0) {
+      if (diff_y_sum < diff_y_threshold) break;
+      if (diff_y_sum > prev_diff_y_sum) break;
+    }
+    prev_diff_y_sum = diff_y_sum;
+  }
+  // final reconstruction
+  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, dst_y, dst_stride_y, dst_u,
+                        dst_stride_u, dst_v, dst_stride_v, width, height);
+
+ End:
+  free(best_y_base);
+  free(best_uv_base);
+  free(target_y_base);
+  free(target_uv_base);
+  free(best_rgb_y);
+  free(best_rgb_uv);
+  free(tmp_buffer);
+  return ok;
+ }
+#undef SAFE_ALLOC
+
+int SharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
+                   const uint8_t* b_ptr, int step, int rgb_stride,
+                   uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u,
+                   int dst_stride_u, uint8_t* dst_v, int dst_stride_v,
+                   int width, int height) {
+  if (width < kMinDimensionIterativeConversion ||
+      height < kMinDimensionIterativeConversion) {
+    return 0;
+  }
+  return DoSharpArgbToYuv(
+      r_ptr, g_ptr, b_ptr, step, rgb_stride, dst_y, dst_stride_y, dst_u,
+      dst_stride_u, dst_v, dst_stride_v, width, height);
+}
+
+//------------------------------------------------------------------------------
--- a/sharpyuv/sharpyuv.h
+++ b/sharpyuv/sharpyuv.h
@@ -0,0 +1,40 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_H_
+#define WEBP_SHARPYUV_SHARPYUV_H_
+
+#include <inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Converts RGB to YUV420 using a downsampling algorithm that minimizes
+// artefacts caused by chroma subsampling.
+// This is slower than standard downsampling (averaging of 4 UV values).
+// Assumes that the image will be upsampled using a bilinear filter. If nearest
+// neighbor is used instead, the upsampled image might look worse than with
+// standard downsampling.
+// TODO(maryla): add 10 bits and handling of various colorspaces. Add YUV444 to
+// YUV420 conversion. Maybe also add 422 support (it's rarely used in practice,
+// especially for images).
+int SharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
+                   const uint8_t* b_ptr, int step, int rgb_stride,
+                   uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u,
+                   int dst_stride_u, uint8_t* dst_v, int dst_stride_v,
+                   int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_H_
--- a/sharpyuv/sharpyuv_dsp.c
+++ b/sharpyuv/sharpyuv_dsp.c
@@ -0,0 +1,101 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "src/dsp/cpu.h"
+
+//-----------------------------------------------------------------------------
+
+#if !WEBP_NEON_OMIT_C_CODE
+#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
+static uint16_t clip_y(int v) {
+  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
+                                  uint16_t* dst, int len) {
+  uint64_t diff = 0;
+  int i;
+  for (i = 0; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip_y(new_y);
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;
+}
+
+static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
+                                int16_t* dst, int len) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
+                                const uint16_t* best_y, uint16_t* out) {
+  int i;
+  for (i = 0; i < len; ++i, ++A, ++B) {
+    const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
+    const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
+    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
+    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
+  }
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+#undef MAX_Y
+
+//-----------------------------------------------------------------------------
+
+uint64_t (*SharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
+                            uint16_t* dst, int len);
+void (*SharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst,
+                          int len);
+void (*SharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
+                          const uint16_t* best_y, uint16_t* out);
+
+extern void InitSharpYUVSSE2(void);
+extern void InitSharpYUVNEON(void);
+
+WEBP_DSP_INIT_FUNC(InitSharpYuv) {
+
+#if !WEBP_NEON_OMIT_C_CODE
+  SharpYUVUpdateY = SharpYUVUpdateY_C;
+  SharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
+  SharpYUVFilterRow = SharpYUVFilterRow_C;
+#endif
+
+#if defined(WEBP_HAVE_SSE2)
+  if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSSE2)) {
+    InitSharpYUVSSE2();
+  }
+#endif  // WEBP_HAVE_SSE2
+
+#if defined(WEBP_HAVE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    InitSharpYUVNEON();
+  }
+#endif  // WEBP_HAVE_NEON
+
+  assert(SharpYUVUpdateY != NULL);
+  assert(SharpYUVUpdateRGB != NULL);
+  assert(SharpYUVFilterRow != NULL);
+}
--- a/sharpyuv/sharpyuv_dsp.h
+++ b/sharpyuv/sharpyuv_dsp.h
@@ -0,0 +1,28 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+
+#ifndef WEBP_SHARPYUV_DSP_H_
+#define WEBP_SHARPYUV_DSP_H_
+
+#include <stdint.h>
+
+#include "src/dsp/cpu.h"
+
+extern uint64_t (*SharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
+                                   uint16_t* dst, int len);
+extern void (*SharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
+                                 int16_t* dst, int len);
+extern void (*SharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
+                                 const uint16_t* best_y, uint16_t* out);
+
+void InitSharpYuv(void);
+
+#endif  // WEBP_SHARPYUV_DSP_H_
--- a/sharpyuv/sharpyuv_neon.c
+++ b/sharpyuv/sharpyuv_neon.c
@@ -0,0 +1,133 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_NEON)
+#include <assert.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+#endif
+
+extern void InitSharpYUVNEON(void);
+
+#if defined(WEBP_USE_NEON)
+
+#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
+static uint16_t clip_y_NEON(int v) {
+  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len) {
+  int i;
+  const int16x8_t zero = vdupq_n_s16(0);
+  const int16x8_t max = vdupq_n_s16(MAX_Y);
+  uint64x2_t sum = vdupq_n_u64(0);
+  uint64_t diff;
+
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
+    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
+    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
+    const int16x8_t D = vsubq_s16(A, B);       // diff_y
+    const int16x8_t F = vaddq_s16(C, D);       // new_y
+    const uint16x8_t H =
+        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
+    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
+    vst1q_u16(dst + i, H);
+    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
+  }
+  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
+  for (; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)(dst[i]) + diff_y;
+    dst[i] = clip_y_NEON(new_y);
+    diff += (uint64_t)(abs(diff_y));
+  }
+  return diff;
+}
+
+static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i;
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t A = vld1q_s16(ref + i);
+    const int16x8_t B = vld1q_s16(src + i);
+    const int16x8_t C = vld1q_s16(dst + i);
+    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
+    const int16x8_t E = vaddq_s16(C, D);   // new_uv
+    vst1q_s16(dst + i, E);
+  }
+  for (; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out) {
+  int i;
+  const int16x8_t max = vdupq_n_s16(MAX_Y);
+  const int16x8_t zero = vdupq_n_s16(0);
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t a0 = vld1q_s16(A + i + 0);
+    const int16x8_t a1 = vld1q_s16(A + i + 1);
+    const int16x8_t b0 = vld1q_s16(B + i + 0);
+    const int16x8_t b1 = vld1q_s16(B + i + 1);
+    const int16x8_t a0b1 = vaddq_s16(a0, b1);
+    const int16x8_t a1b0 = vaddq_s16(a1, b0);
+    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
+    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
+    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
+    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
+    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
+    const int16x8_t d0 = vaddq_s16(c1, a0);
+    const int16x8_t d1 = vaddq_s16(c0, a1);
+    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
+    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
+    const int16x8x2_t f = vzipq_s16(e0, e1);
+    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
+    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
+    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
+    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
+    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
+    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
+    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
+    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
+  }
+  for (; i < len; ++i) {
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
+    out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
+  }
+}
+#undef MAX_Y
+
+//------------------------------------------------------------------------------
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYUVNEON(void) {
+  SharpYUVUpdateY = SharpYUVUpdateY_NEON;
+  SharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
+  SharpYUVFilterRow = SharpYUVFilterRow_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+
+void InitSharpYUVNEON(void) {}
+
+#endif  // WEBP_USE_NEON
--- a/sharpyuv/sharpyuv_sse2.c
+++ b/sharpyuv/sharpyuv_sse2.c
@@ -0,0 +1,143 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <stdlib.h>
+#include <emmintrin.h>
+#endif
+
+extern void InitSharpYUVSSE2(void);
+
+#if defined(WEBP_USE_SSE2)
+
+#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
+static uint16_t clip_y(int v) {
+  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len) {
+  uint64_t diff = 0;
+  uint32_t tmp[4];
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i max = _mm_set1_epi16(MAX_Y);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i sum = zero;
+
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
+    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
+    const __m128i F = _mm_add_epi16(C, D);       // new_y
+    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
+    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
+    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
+    _mm_storeu_si128((__m128i*)(dst + i), H);
+    sum = _mm_add_epi32(sum, I);
+  }
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+  for (; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip_y(new_y);
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;
+}
+
+static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i = 0;
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
+    const __m128i E = _mm_add_epi16(C, D);   // new_uv
+    _mm_storeu_si128((__m128i*)(dst + i), E);
+  }
+  for (; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out) {
+  int i;
+  const __m128i kCst8 = _mm_set1_epi16(8);
+  const __m128i max = _mm_set1_epi16(MAX_Y);
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
+    const __m128i a0b1 = _mm_add_epi16(a0, b1);
+    const __m128i a1b0 = _mm_add_epi16(a1, b0);
+    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
+    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
+    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
+    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
+    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
+    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
+    const __m128i d0 = _mm_add_epi16(c1, a0);
+    const __m128i d1 = _mm_add_epi16(c0, a1);
+    const __m128i e0 = _mm_srai_epi16(d0, 1);
+    const __m128i e1 = _mm_srai_epi16(d1, 1);
+    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
+    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
+    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
+    const __m128i h0 = _mm_add_epi16(g0, f0);
+    const __m128i h1 = _mm_add_epi16(g1, f1);
+    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
+    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
+  }
+  for (; i < len; ++i) {
+    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+    // We reuse the common sub-expressions.
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
+    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
+  }
+}
+#undef MAX_Y
+
+//------------------------------------------------------------------------------
+
+extern void InitSharpYUVSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYUVSSE2(void) {
+  SharpYUVUpdateY = SharpYUVUpdateY_SSE2;
+  SharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
+  SharpYUVFilterRow = SharpYUVFilterRow_SSE2;
+}
+#else  // !WEBP_USE_SSE2
+
+void InitSharpYUVSSE2(void) {}
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -24,6 +24,7 @@ commondir = $(includedir)/webp
 COMMON_SOURCES =
 COMMON_SOURCES += alpha_processing.c
 COMMON_SOURCES += cpu.c
+COMMON_SOURCES += cpu.h
 COMMON_SOURCES += dec.c
 COMMON_SOURCES += dec_clip_tables.c
 COMMON_SOURCES += dsp.h
--- a/src/dsp/cpu.h
+++ b/src/dsp/cpu.h
@@ -0,0 +1,239 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   CPU detection functions and macros.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_CPU_H_
+#define WEBP_DSP_CPU_H_
+
+#ifdef HAVE_CONFIG_H
+#include "src/webp/config.h"
+#endif
+
+#include "src/webp/types.h"
+
+#if defined(__GNUC__)
+#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_GCC_VERSION 0
+#define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#if defined(__clang__)
+#define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+#define LOCAL_CLANG_PREREQ(maj, min) \
+  (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_CLANG_VERSION 0
+#define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if !defined(HAVE_CONFIG_H)
+#if defined(_MSC_VER) && _MSC_VER > 1310 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
+#endif
+#endif
+
+// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
+// files without intrinsics, allowing the corresponding Init() to be called.
+// Files containing intrinsics will need to be built targeting the instruction
+// set so should succeed on one of the earlier tests.
+#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
+#define WEBP_USE_SSE2
+#endif
+
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
+#define WEBP_HAVE_SSE2
+#endif
+
+#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
+#define WEBP_USE_SSE41
+#endif
+
+#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
+#define WEBP_HAVE_SSE41
+#endif
+
+#undef WEBP_MSC_SSE41
+#undef WEBP_MSC_SSE2
+
+// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
+// inline assembly would need to be modified for use with Native Client.
+#if ((defined(__ARM_NEON__) || defined(__aarch64__)) &&       \
+     (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
+    !defined(__native_client__)
+#define WEBP_USE_NEON
+#endif
+
+#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
+    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
+#define WEBP_ANDROID_NEON  // Android targets that may have NEON
+#define WEBP_USE_NEON
+#endif
+
+// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
+// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
+// arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with
+// vtbl4_u8(); a fix was made in 16.6.
+#if defined(_MSC_VER) && ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+                          (_MSC_VER >= 1926 && defined(_M_ARM64)))
+#define WEBP_USE_NEON
+#define WEBP_USE_INTRINSICS
+#endif
+
+#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
+#define WEBP_HAVE_NEON
+#endif
+
+#if defined(__mips__) && !defined(__mips64) && defined(__mips_isa_rev) && \
+    (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
+#define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
+#define WEBP_USE_MIPS_DSP_R2
+#endif
+#endif
+#endif
+
+#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
+#define WEBP_USE_MSA
+#endif
+
+#ifndef WEBP_DSP_OMIT_C_CODE
+#define WEBP_DSP_OMIT_C_CODE 1
+#endif
+
+#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
+#define WEBP_NEON_OMIT_C_CODE 1
+#else
+#define WEBP_NEON_OMIT_C_CODE 0
+#endif
+
+#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || \
+      defined(__aarch64__))
+#define WEBP_NEON_WORK_AROUND_GCC 1
+#else
+#define WEBP_NEON_WORK_AROUND_GCC 0
+#endif
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#define WEBP_TSAN_IGNORE_FUNCTION
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#undef WEBP_TSAN_IGNORE_FUNCTION
+#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
+#endif
+#endif
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>  // NOLINT
+
+#define WEBP_DSP_INIT(func)                                         \
+  do {                                                              \
+    static volatile VP8CPUInfo func##_last_cpuinfo_used =           \
+        (VP8CPUInfo)&func##_last_cpuinfo_used;                      \
+    static pthread_mutex_t func##_lock = PTHREAD_MUTEX_INITIALIZER; \
+    if (pthread_mutex_lock(&func##_lock)) break;                    \
+    if (func##_last_cpuinfo_used != VP8GetCPUInfo) func();          \
+    func##_last_cpuinfo_used = VP8GetCPUInfo;                       \
+    (void)pthread_mutex_unlock(&func##_lock);                       \
+  } while (0)
+#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define WEBP_DSP_INIT(func)                               \
+  do {                                                    \
+    static volatile VP8CPUInfo func##_last_cpuinfo_used = \
+        (VP8CPUInfo)&func##_last_cpuinfo_used;            \
+    if (func##_last_cpuinfo_used == VP8GetCPUInfo) break; \
+    func();                                               \
+    func##_last_cpuinfo_used = VP8GetCPUInfo;             \
+  } while (0)
+#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
+// Defines an Init + helper function that control multiple initialization of
+// function pointers / tables.
+/* Usage:
+   WEBP_DSP_INIT_FUNC(InitFunc) {
+     ...function body
+   }
+*/
+#define WEBP_DSP_INIT_FUNC(name)                                            \
+  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void);                  \
+  WEBP_TSAN_IGNORE_FUNCTION void name(void) { WEBP_DSP_INIT(name##_body); } \
+  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void)
+
+#define WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures. This is only meant to silence unaligned loads on platforms that
+// are known to support them.
+#undef WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNDEF __attribute__((no_sanitize("undefined")))
+
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures related to unsigned integer overflows. This is only meant to
+// silence cases where this well defined behavior is expected.
+#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
+// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
+#if !defined(WEBP_OFFSET_PTR)
+#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
+#endif
+
+// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
+#if !defined(WEBP_SWAP_16BIT_CSP)
+#define WEBP_SWAP_16BIT_CSP 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) &&                   \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+typedef enum {
+  kSSE2,
+  kSSE3,
+  kSlowSSSE3,  // special feature for slow SSSE3 architectures
+  kSSE4_1,
+  kAVX,
+  kAVX2,
+  kNEON,
+  kMIPS32,
+  kMIPSdspR2,
+  kMSA
+} CPUFeature;
+// returns true if the CPU supports the feature.
+typedef int (*VP8CPUInfo)(CPUFeature feature);
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
+
+#endif  // WEBP_DSP_CPU_H_
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -18,6 +18,7 @@
 #include "src/webp/config.h"
 #endif

+#include "src/dsp/cpu.h"
 #include "src/webp/types.h"

 #ifdef __cplusplus
@@ -43,226 +44,6 @@ extern "C" {
 #define WEBP_RESTRICT
 #endif

-//------------------------------------------------------------------------------
-// CPU detection
-
-#if defined(__GNUC__)
-# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
-# define LOCAL_GCC_PREREQ(maj, min) \
-    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_GCC_VERSION 0
-# define LOCAL_GCC_PREREQ(maj, min) 0
-#endif
-
-#if defined(__clang__)
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif
-
-#ifndef __has_builtin
-# define __has_builtin(x) 0
-#endif
-
-#if !defined(HAVE_CONFIG_H)
-#if defined(_MSC_VER) && _MSC_VER > 1310 && \
-    (defined(_M_X64) || defined(_M_IX86))
-#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
-    (defined(_M_X64) || defined(_M_IX86))
-#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
-#endif
-#endif
-
-// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
-// files without intrinsics, allowing the corresponding Init() to be called.
-// Files containing intrinsics will need to be built targeting the instruction
-// set so should succeed on one of the earlier tests.
-#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
-    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
-#define WEBP_USE_SSE2
-#endif
-
-#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
-#define WEBP_HAVE_SSE2
-#endif
-
-#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
-    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
-#define WEBP_USE_SSE41
-#endif
-
-#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
-#define WEBP_HAVE_SSE41
-#endif
-
-#undef WEBP_MSC_SSE41
-#undef WEBP_MSC_SSE2
-
-// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
-// inline assembly would need to be modified for use with Native Client.
-#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \
-     (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
-    !defined(__native_client__)
-#define WEBP_USE_NEON
-#endif
-
-#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
-    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
-#define WEBP_ANDROID_NEON  // Android targets that may have NEON
-#define WEBP_USE_NEON
-#endif
-
-// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
-// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
-// arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with
-// vtbl4_u8(); a fix was made in 16.6.
-#if defined(_MSC_VER) && \
-  ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
-   (_MSC_VER >= 1926 && defined(_M_ARM64)))
-#define WEBP_USE_NEON
-#define WEBP_USE_INTRINSICS
-#endif
-
-#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
-#define WEBP_HAVE_NEON
-#endif
-
-#if defined(__mips__) && !defined(__mips64) && \
-    defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
-#define WEBP_USE_MIPS32
-#if (__mips_isa_rev >= 2)
-#define WEBP_USE_MIPS32_R2
-#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
-#define WEBP_USE_MIPS_DSP_R2
-#endif
-#endif
-#endif
-
-#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
-#define WEBP_USE_MSA
-#endif
-
-#ifndef WEBP_DSP_OMIT_C_CODE
-#define WEBP_DSP_OMIT_C_CODE 1
-#endif
-
-#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
-#define WEBP_NEON_OMIT_C_CODE 1
-#else
-#define WEBP_NEON_OMIT_C_CODE 0
-#endif
-
-#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
-#define WEBP_NEON_WORK_AROUND_GCC 1
-#else
-#define WEBP_NEON_WORK_AROUND_GCC 0
-#endif
-
-// This macro prevents thread_sanitizer from reporting known concurrent writes.
-#define WEBP_TSAN_IGNORE_FUNCTION
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#undef WEBP_TSAN_IGNORE_FUNCTION
-#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
-#endif
-#endif
-
-#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
-#include <pthread.h>  // NOLINT
-
-#define WEBP_DSP_INIT(func) do {                                    \
-  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
-      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
-  static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \
-  if (pthread_mutex_lock(&func ## _lock)) break;                    \
-  if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func();          \
-  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
-  (void)pthread_mutex_unlock(&func ## _lock);                       \
-} while (0)
-#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
-#define WEBP_DSP_INIT(func) do {                                    \
-  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
-      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
-  if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break;           \
-  func();                                                           \
-  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
-} while (0)
-#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
-
-// Defines an Init + helper function that control multiple initialization of
-// function pointers / tables.
-/* Usage:
-   WEBP_DSP_INIT_FUNC(InitFunc) {
-     ...function body
-   }
-*/
-#define WEBP_DSP_INIT_FUNC(name)                             \
-  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \
-  WEBP_TSAN_IGNORE_FUNCTION void name(void) {                \
-    WEBP_DSP_INIT(name ## _body);                            \
-  }                                                          \
-  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void)
-
-#define WEBP_UBSAN_IGNORE_UNDEF
-#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#if defined(__clang__) && defined(__has_attribute)
-#if __has_attribute(no_sanitize)
-// This macro prevents the undefined behavior sanitizer from reporting
-// failures. This is only meant to silence unaligned loads on platforms that
-// are known to support them.
-#undef WEBP_UBSAN_IGNORE_UNDEF
-#define WEBP_UBSAN_IGNORE_UNDEF \
-  __attribute__((no_sanitize("undefined")))
-
-// This macro prevents the undefined behavior sanitizer from reporting
-// failures related to unsigned integer overflows. This is only meant to
-// silence cases where this well defined behavior is expected.
-#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
-  __attribute__((no_sanitize("unsigned-integer-overflow")))
-#endif
-#endif
-
-// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
-// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
-#if !defined(WEBP_OFFSET_PTR)
-#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
-#endif
-
-// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
-#if !defined(WEBP_SWAP_16BIT_CSP)
-#define WEBP_SWAP_16BIT_CSP 0
-#endif
-
-// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
-#if !defined(WORDS_BIGENDIAN) && \
-    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
-     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
-#define WORDS_BIGENDIAN
-#endif
-
-typedef enum {
-  kSSE2,
-  kSSE3,
-  kSlowSSSE3,  // special feature for slow SSSE3 architectures
-  kSSE4_1,
-  kAVX,
-  kAVX2,
-  kNEON,
-  kMIPS32,
-  kMIPSdspR2,
-  kMSA
-} CPUFeature;
-// returns true if the CPU supports the feature.
-typedef int (*VP8CPUInfo)(CPUFeature feature);
-WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;

 //------------------------------------------------------------------------------
 // Init stub generator
--- a/src/utils/Makefile.am
+++ b/src/utils/Makefile.am
@@ -9,6 +9,7 @@ common_HEADERS = ../webp/types.h
 commondir = $(includedir)/webp

 noinst_HEADERS =
+noinst_HEADERS += ../dsp/cpu.h
 noinst_HEADERS += ../dsp/dsp.h
 noinst_HEADERS += ../webp/decode.h
 noinst_HEADERS += ../webp/encode.h