diff --git a/CMakeLists.txt b/CMakeLists.txt index 9610eaa0..e373a7ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(WEBP_BUILD_VWEBP "Build the vwebp viewer tool." ON) option(WEBP_BUILD_WEBPINFO "Build the webpinfo command line tool." ON) option(WEBP_BUILD_LIBWEBPMUX "Build the libwebpmux library." ON) option(WEBP_BUILD_WEBPMUX "Build the webpmux command line tool." ON) +option(WEBP_BUILD_SHARPYUV "Build the sharpyuv library." OFF) option(WEBP_BUILD_EXTRAS "Build extras." ON) option(WEBP_BUILD_WEBP_JS "Emscripten build of webp.js." OFF) option(WEBP_USE_THREAD "Enable threading support" ON) @@ -211,6 +212,19 @@ function(libwebp_add_stub_file TARGET) target_sources(${TARGET} PRIVATE ${stub_source_file}) endfunction() +if(WEBP_BUILD_SHARPYUV) + parse_makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/sharpyuv "WEBP_SHARPYUV_SRCS" + "") + add_library(sharpyuv OBJECT ${WEBP_SHARPYUV_SRCS}) + target_include_directories(sharpyuv + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) + set_target_properties( + sharpyuv + PROPERTIES PUBLIC_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/sharpyuv/sharpyuv.h;\ +${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h") +endif() + if(MSVC) # avoid security warnings for e.g., fopen() used in the examples. add_definitions(-D_CRT_SECURE_NO_WARNINGS) diff --git a/Makefile.am b/Makefile.am index e415f6f1..3ab4d29f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,6 +2,10 @@ ACLOCAL_AMFLAGS = -I m4 SUBDIRS = src imageio man EXTRA_DIST = COPYING autogen.sh +if BUILD_SHARPYUV + SUBDIRS += sharpyuv +endif + if BUILD_EXTRAS SUBDIRS += extras endif diff --git a/configure.ac b/configure.ac index 1047ec29..7384d1f9 100644 --- a/configure.ac +++ b/configure.ac @@ -27,7 +27,8 @@ AC_ARG_ENABLE([everything], AS_HELP_STRING([--enable-everything], [Enable all optional targets. These can still be disabled with --disable-target]), - [SET_IF_UNSET([enable_libwebpdecoder], [$enableval]) + [SET_IF_UNSET([enable_libsharpyuv], [$enableval]) + SET_IF_UNSET([enable_libwebpdecoder], [$enableval]) SET_IF_UNSET([enable_libwebpdemux], [$enableval]) SET_IF_UNSET([enable_libwebpextras], [$enableval]) SET_IF_UNSET([enable_libwebpmux], [$enableval])]) @@ -66,6 +67,14 @@ AC_ARG_ENABLE([libwebpextras], AC_MSG_RESULT(${enable_libwebpextras-no}) AM_CONDITIONAL([BUILD_EXTRAS], [test "$enable_libwebpextras" = "yes"]) +dnl === Check whether libsharpyuv should be built +AC_MSG_CHECKING(whether libsharpyuv is to be built) +AC_ARG_ENABLE([libsharpyuv], + AS_HELP_STRING([--enable-libsharpyuv], + [Build libsharpyuv @<:@default=no@:>@])) +AC_MSG_RESULT(${enable_libsharpyuv-no}) +AM_CONDITIONAL([BUILD_SHARPYUV], [test "$enable_libsharpyuv" = "yes"]) + dnl === If --enable-asserts is not defined, define NDEBUG AC_MSG_CHECKING(whether asserts are enabled) @@ -751,6 +760,7 @@ AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_HEADERS([src/webp/config.h]) AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \ examples/Makefile extras/Makefile imageio/Makefile \ + sharpyuv/Makefile \ src/dec/Makefile src/enc/Makefile src/dsp/Makefile \ src/demux/Makefile src/mux/Makefile \ src/utils/Makefile \ @@ -772,6 +782,7 @@ libwebpdecoder: ${enable_libwebpdecoder-no} libwebpdemux: ${enable_libwebpdemux-no} libwebpmux: ${enable_libwebpmux-no} libwebpextras: ${enable_libwebpextras-no} +libsharpyuv: ${enable_libsharpyuv-no} Tools: cwebp : ${enable_libwebpdemux-no} diff --git a/makefile.unix b/makefile.unix index 82c4d411..ee31aa6c 100644 --- a/makefile.unix +++ b/makefile.unix @@ -304,6 +304,7 @@ HDRS = \ src/dec/vp8li_dec.h \ src/dec/webpi_dec.h \ src/dsp/common_sse2.h \ + src/dsp/cpu.h \ src/dsp/dsp.h \ src/dsp/lossless.h \ src/dsp/lossless_common.h \ diff --git a/sharpyuv/Makefile.am b/sharpyuv/Makefile.am new file mode 100644 index 00000000..fa862e37 --- /dev/null +++ b/sharpyuv/Makefile.am @@ -0,0 +1,32 @@ +AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir) +AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src +noinst_LTLIBRARIES = +noinst_LTLIBRARIES += libsharpyuv.la +noinst_LTLIBRARIES += libsharpyuv_sse2.la +noinst_LTLIBRARIES += libsharpyuv_neon.la + +noinst_HEADERS = +noinst_HEADERS += ../src/webp/types.h +noinst_HEADERS += ../src/dsp/cpu.h + +libsharpyuv_sse2_la_SOURCES = +libsharpyuv_sse2_la_SOURCES += sharpyuv_sse2.c +libsharpyuv_sse2_la_CPPFLAGS = $(libsharpyuv_la_CPPFLAGS) +libsharpyuv_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS) + +libsharpyuv_neon_la_SOURCES = +libsharpyuv_neon_la_SOURCES += sharpyuv_neon.c +libsharpyuv_neon_la_CPPFLAGS = $(libsharpyuv_la_CPPFLAGS) +libsharpyuv_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_FLAGS) + +libsharpyuv_la_SOURCES = +libsharpyuv_la_SOURCES += sharpyuv_dsp.c sharpyuv_dsp.h +libsharpyuv_la_SOURCES += sharpyuv.c sharpyuv.h + +libsharpyuv_la_CPPFLAGS = $(AM_CPPFLAGS) +libsharpyuv_la_LDFLAGS = +libsharpyuv_la_LIBADD = +libsharpyuv_la_LIBADD += libsharpyuv_sse2.la +libsharpyuv_la_LIBADD += libsharpyuv_neon.la + +noinst_PROGRAMS = diff --git a/sharpyuv/sharpyuv.c b/sharpyuv/sharpyuv.c new file mode 100644 index 00000000..5376f6be --- /dev/null +++ b/sharpyuv/sharpyuv.c @@ -0,0 +1,460 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Sharp RGB to YUV conversion. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "./sharpyuv.h" + +#include +#include +#include +#include + +#include "src/webp/types.h" +#include "src/dsp/cpu.h" +#include "sharpyuv/sharpyuv_dsp.h" + +//------------------------------------------------------------------------------ +// Code for gamma correction + +// gamma-compensates loss of resolution during chroma subsampling +#define kGamma 0.80 // for now we use a different gamma value than kGammaF +#define kGammaFix 12 // fixed-point precision for linear values +#define kGammaScale ((1 << kGammaFix) - 1) +#define kGammaTabFix 7 // fixed-point fractional bits precision +#define kGammaTabScale (1 << kGammaTabFix) +#define kGammaTabRounder (kGammaTabScale >> 1) +#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix)) + +enum { + YUV_FIX = 16, // fixed-point precision for RGB->YUV + YUV_HALF = 1 << (YUV_FIX - 1), +}; + +//------------------------------------------------------------------------------ +// Sharp RGB->YUV conversion + +static const int kNumIterations = 4; +static const int kMinDimensionIterativeConversion = 4; + +// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some +// banding sometimes. Better use extra precision. +#define SFIX 2 // fixed-point precision of RGB and Y/W +typedef int16_t fixed_t; // signed type with extra SFIX precision for UV +typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W + +#define SHALF (1 << SFIX >> 1) +#define MAX_Y_T ((256 << SFIX) - 1) +#define SROUNDER (1 << (YUV_FIX + SFIX - 1)) + +// We use tables of different size and precision for the Rec709 / BT2020 +// transfer function. +#define kGammaF (1./0.45) +static uint32_t kLinearToGammaTabS[kGammaTabSize + 2]; +#define GAMMA_TO_LINEAR_BITS 14 +static uint32_t kGammaToLinearTabS[MAX_Y_T + 1]; // size scales with Y_FIX +static volatile int kGammaTablesSOk = 0; +static void InitGammaTablesS(void); + +WEBP_DSP_INIT_FUNC(InitGammaTablesS) { + assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values + if (!kGammaTablesSOk) { + int v; + const double norm = 1. / MAX_Y_T; + const double scale = 1. / kGammaTabSize; + const double a = 0.09929682680944; + const double thresh = 0.018053968510807; + const double final_scale = 1 << GAMMA_TO_LINEAR_BITS; + for (v = 0; v <= MAX_Y_T; ++v) { + const double g = norm * v; + double value; + if (g <= thresh * 4.5) { + value = g / 4.5; + } else { + const double a_rec = 1. / (1. + a); + value = pow(a_rec * (g + a), kGammaF); + } + kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5); + } + for (v = 0; v <= kGammaTabSize; ++v) { + const double g = scale * v; + double value; + if (g <= thresh) { + value = 4.5 * g; + } else { + value = (1. + a) * pow(g, 1. / kGammaF) - a; + } + // we already incorporate the 1/2 rounding constant here + kLinearToGammaTabS[v] = + (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1); + } + // to prevent small rounding errors to cause read-overflow: + kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize]; + kGammaTablesSOk = 1; + } +} + +// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS +static WEBP_INLINE uint32_t GammaToLinearS(int v) { + return kGammaToLinearTabS[v]; +} + +static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) { + // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision + const uint32_t v = value * kGammaTabSize; + const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS; + // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision + const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part + // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1]) + const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0]; + const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1]; + // Final interpolation. Note that rounding is already included. + const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0. + const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS); + return result; +} + +//------------------------------------------------------------------------------ + +static uint8_t clip_8b(fixed_t v) { + return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u; +} + +static fixed_y_t clip_y(int y) { + return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T; +} + +//------------------------------------------------------------------------------ + +static int RGBToGray(int r, int g, int b) { + const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF; + return (luma >> YUV_FIX); +} + +static uint32_t ScaleDown(int a, int b, int c, int d) { + const uint32_t A = GammaToLinearS(a); + const uint32_t B = GammaToLinearS(b); + const uint32_t C = GammaToLinearS(c); + const uint32_t D = GammaToLinearS(d); + return LinearToGammaS((A + B + C + D + 2) >> 2); +} + +static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) { + int i; + for (i = 0; i < w; ++i) { + const uint32_t R = GammaToLinearS(src[0 * w + i]); + const uint32_t G = GammaToLinearS(src[1 * w + i]); + const uint32_t B = GammaToLinearS(src[2 * w + i]); + const uint32_t Y = RGBToGray(R, G, B); + dst[i] = (fixed_y_t)LinearToGammaS(Y); + } +} + +static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2, + fixed_t* dst, int uv_w) { + int i; + for (i = 0; i < uv_w; ++i) { + const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], + src2[0 * uv_w + 0], src2[0 * uv_w + 1]); + const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], + src2[2 * uv_w + 0], src2[2 * uv_w + 1]); + const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], + src2[4 * uv_w + 0], src2[4 * uv_w + 1]); + const int W = RGBToGray(r, g, b); + dst[0 * uv_w] = (fixed_t)(r - W); + dst[1 * uv_w] = (fixed_t)(g - W); + dst[2 * uv_w] = (fixed_t)(b - W); + dst += 1; + src1 += 2; + src2 += 2; + } +} + +static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) { + int i; + for (i = 0; i < w; ++i) { + y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]); + } +} + +//------------------------------------------------------------------------------ + +static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) { + const int v0 = (A * 3 + B + 2) >> 2; + return clip_y(v0 + W0); +} + +//------------------------------------------------------------------------------ + +static WEBP_INLINE fixed_y_t UpLift(uint8_t a) { // 8bit -> SFIX + return ((fixed_y_t)a << SFIX) | SHALF; +} + +static void ImportOneRow(const uint8_t* const r_ptr, + const uint8_t* const g_ptr, + const uint8_t* const b_ptr, + int step, + int pic_width, + fixed_y_t* const dst) { + int i; + const int w = (pic_width + 1) & ~1; + for (i = 0; i < pic_width; ++i) { + const int off = i * step; + dst[i + 0 * w] = UpLift(r_ptr[off]); + dst[i + 1 * w] = UpLift(g_ptr[off]); + dst[i + 2 * w] = UpLift(b_ptr[off]); + } + if (pic_width & 1) { // replicate rightmost pixel + dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1]; + dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1]; + dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1]; + } +} + +static void InterpolateTwoRows(const fixed_y_t* const best_y, + const fixed_t* prev_uv, + const fixed_t* cur_uv, + const fixed_t* next_uv, + int w, + fixed_y_t* out1, + fixed_y_t* out2) { + const int uv_w = w >> 1; + const int len = (w - 1) >> 1; // length to filter + int k = 3; + while (k-- > 0) { // process each R/G/B segments in turn + // special boundary case for i==0 + out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]); + out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]); + + SharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1); + SharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1); + + // special boundary case for i == w - 1 when w is even + if (!(w & 1)) { + out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1], + best_y[w - 1 + 0]); + out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1], + best_y[w - 1 + w]); + } + out1 += w; + out2 += w; + prev_uv += uv_w; + cur_uv += uv_w; + next_uv += uv_w; + } +} + +static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) { + const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER; + return clip_8b(16 + (luma >> (YUV_FIX + SFIX))); +} + +static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) { + const int u = -9719 * r - 19081 * g + 28800 * b + SROUNDER; + return clip_8b(128 + (u >> (YUV_FIX + SFIX))); +} + +static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) { + const int v = +28800 * r - 24116 * g - 4684 * b + SROUNDER; + return clip_8b(128 + (v >> (YUV_FIX + SFIX))); +} + +static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv, + uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, + int dst_stride_u, uint8_t* dst_v, int dst_stride_v, + int width, int height) { + int i, j; + const fixed_t* const best_uv_base = best_uv; + const int w = (width + 1) & ~1; + const int h = (height + 1) & ~1; + const int uv_w = w >> 1; + const int uv_h = h >> 1; + for (best_uv = best_uv_base, j = 0; j < height; ++j) { + for (i = 0; i < width; ++i) { + const int off = (i >> 1); + const int W = best_y[i]; + const int r = best_uv[off + 0 * uv_w] + W; + const int g = best_uv[off + 1 * uv_w] + W; + const int b = best_uv[off + 2 * uv_w] + W; + dst_y[i] = ConvertRGBToY(r, g, b); + } + best_y += w; + best_uv += (j & 1) * 3 * uv_w; + dst_y += dst_stride_y; + } + for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) { + for (i = 0; i < uv_w; ++i) { + const int off = i; + const int r = best_uv[off + 0 * uv_w]; + const int g = best_uv[off + 1 * uv_w]; + const int b = best_uv[off + 2 * uv_w]; + dst_u[i] = ConvertRGBToU(r, g, b); + dst_v[i] = ConvertRGBToV(r, g, b); + } + best_uv += 3 * uv_w; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 1; +} + +//------------------------------------------------------------------------------ +// Main function + +static void* SafeMalloc(uint64_t nmemb, size_t size) { + const uint64_t total_size = nmemb * (uint64_t)size; + if (total_size != (size_t)total_size) return NULL; + return malloc((size_t)total_size); +} + +#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((W) * (H), sizeof(T))) + +static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr, + const uint8_t* b_ptr, int step, int rgb_stride, + uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, + int dst_stride_u, uint8_t* dst_v, int dst_stride_v, + int width, int height) { + // we expand the right/bottom border if needed + const int w = (width + 1) & ~1; + const int h = (height + 1) & ~1; + const int uv_w = w >> 1; + const int uv_h = h >> 1; + uint64_t prev_diff_y_sum = ~0; + int j, iter; + + // TODO(skal): allocate one big memory chunk. But for now, it's easier + // for valgrind debugging to have several chunks. + fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch + fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t); + fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t); + fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t); + fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t); + fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t); + fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t); + fixed_y_t* best_y = best_y_base; + fixed_y_t* target_y = target_y_base; + fixed_t* best_uv = best_uv_base; + fixed_t* target_uv = target_uv_base; + const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h); + int ok; + + if (best_y_base == NULL || best_uv_base == NULL || + target_y_base == NULL || target_uv_base == NULL || + best_rgb_y == NULL || best_rgb_uv == NULL || + tmp_buffer == NULL) { + ok = 0; + goto End; + } + + InitGammaTablesS(); + InitSharpYuv(); + + // Import RGB samples to W/RGB representation. + for (j = 0; j < height; j += 2) { + const int is_last_row = (j == height - 1); + fixed_y_t* const src1 = tmp_buffer + 0 * w; + fixed_y_t* const src2 = tmp_buffer + 3 * w; + + // prepare two rows of input + ImportOneRow(r_ptr, g_ptr, b_ptr, step, width, src1); + if (!is_last_row) { + ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride, + step, width, src2); + } else { + memcpy(src2, src1, 3 * w * sizeof(*src2)); + } + StoreGray(src1, best_y + 0, w); + StoreGray(src2, best_y + w, w); + + UpdateW(src1, target_y, w); + UpdateW(src2, target_y + w, w); + UpdateChroma(src1, src2, target_uv, uv_w); + memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv)); + best_y += 2 * w; + best_uv += 3 * uv_w; + target_y += 2 * w; + target_uv += 3 * uv_w; + r_ptr += 2 * rgb_stride; + g_ptr += 2 * rgb_stride; + b_ptr += 2 * rgb_stride; + } + + // Iterate and resolve clipping conflicts. + for (iter = 0; iter < kNumIterations; ++iter) { + const fixed_t* cur_uv = best_uv_base; + const fixed_t* prev_uv = best_uv_base; + uint64_t diff_y_sum = 0; + + best_y = best_y_base; + best_uv = best_uv_base; + target_y = target_y_base; + target_uv = target_uv_base; + for (j = 0; j < h; j += 2) { + fixed_y_t* const src1 = tmp_buffer + 0 * w; + fixed_y_t* const src2 = tmp_buffer + 3 * w; + { + const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0); + InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2); + prev_uv = cur_uv; + cur_uv = next_uv; + } + + UpdateW(src1, best_rgb_y + 0 * w, w); + UpdateW(src2, best_rgb_y + 1 * w, w); + UpdateChroma(src1, src2, best_rgb_uv, uv_w); + + // update two rows of Y and one row of RGB + diff_y_sum += SharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w); + SharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w); + + best_y += 2 * w; + best_uv += 3 * uv_w; + target_y += 2 * w; + target_uv += 3 * uv_w; + } + // test exit condition + if (iter > 0) { + if (diff_y_sum < diff_y_threshold) break; + if (diff_y_sum > prev_diff_y_sum) break; + } + prev_diff_y_sum = diff_y_sum; + } + // final reconstruction + ok = ConvertWRGBToYUV(best_y_base, best_uv_base, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height); + + End: + free(best_y_base); + free(best_uv_base); + free(target_y_base); + free(target_uv_base); + free(best_rgb_y); + free(best_rgb_uv); + free(tmp_buffer); + return ok; + } +#undef SAFE_ALLOC + +int SharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr, + const uint8_t* b_ptr, int step, int rgb_stride, + uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, + int dst_stride_u, uint8_t* dst_v, int dst_stride_v, + int width, int height) { + if (width < kMinDimensionIterativeConversion || + height < kMinDimensionIterativeConversion) { + return 0; + } + return DoSharpArgbToYuv( + r_ptr, g_ptr, b_ptr, step, rgb_stride, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height); +} + +//------------------------------------------------------------------------------ diff --git a/sharpyuv/sharpyuv.h b/sharpyuv/sharpyuv.h new file mode 100644 index 00000000..51aaf482 --- /dev/null +++ b/sharpyuv/sharpyuv.h @@ -0,0 +1,40 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Sharp RGB to YUV conversion. + +#ifndef WEBP_SHARPYUV_SHARPYUV_H_ +#define WEBP_SHARPYUV_SHARPYUV_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Converts RGB to YUV420 using a downsampling algorithm that minimizes +// artefacts caused by chroma subsampling. +// This is slower than standard downsampling (averaging of 4 UV values). +// Assumes that the image will be upsampled using a bilinear filter. If nearest +// neighbor is used instead, the upsampled image might look worse than with +// standard downsampling. +// TODO(maryla): add 10 bits and handling of various colorspaces. Add YUV444 to +// YUV420 conversion. Maybe also add 422 support (it's rarely used in practice, +// especially for images). +int SharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr, + const uint8_t* b_ptr, int step, int rgb_stride, + uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, + int dst_stride_u, uint8_t* dst_v, int dst_stride_v, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // WEBP_SHARPYUV_SHARPYUV_H_ diff --git a/sharpyuv/sharpyuv_dsp.c b/sharpyuv/sharpyuv_dsp.c new file mode 100644 index 00000000..15e88327 --- /dev/null +++ b/sharpyuv/sharpyuv_dsp.c @@ -0,0 +1,101 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Speed-critical functions for Sharp YUV. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "sharpyuv/sharpyuv_dsp.h" + +#include +#include + +#include "src/dsp/cpu.h" + +//----------------------------------------------------------------------------- + +#if !WEBP_NEON_OMIT_C_CODE +#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic +static uint16_t clip_y(int v) { + return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +} + +static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src, + uint16_t* dst, int len) { + uint64_t diff = 0; + int i; + for (i = 0; i < len; ++i) { + const int diff_y = ref[i] - src[i]; + const int new_y = (int)dst[i] + diff_y; + dst[i] = clip_y(new_y); + diff += (uint64_t)abs(diff_y); + } + return diff; +} + +static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src, + int16_t* dst, int len) { + int i; + for (i = 0; i < len; ++i) { + const int diff_uv = ref[i] - src[i]; + dst[i] += diff_uv; + } +} + +static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out) { + int i; + for (i = 0; i < len; ++i, ++A, ++B) { + const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4; + const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4; + out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); + out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); + } +} +#endif // !WEBP_NEON_OMIT_C_CODE + +#undef MAX_Y + +//----------------------------------------------------------------------------- + +uint64_t (*SharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref, + uint16_t* dst, int len); +void (*SharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst, + int len); +void (*SharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out); + +extern void InitSharpYUVSSE2(void); +extern void InitSharpYUVNEON(void); + +WEBP_DSP_INIT_FUNC(InitSharpYuv) { + +#if !WEBP_NEON_OMIT_C_CODE + SharpYUVUpdateY = SharpYUVUpdateY_C; + SharpYUVUpdateRGB = SharpYUVUpdateRGB_C; + SharpYUVFilterRow = SharpYUVFilterRow_C; +#endif + +#if defined(WEBP_HAVE_SSE2) + if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSSE2)) { + InitSharpYUVSSE2(); + } +#endif // WEBP_HAVE_SSE2 + +#if defined(WEBP_HAVE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + InitSharpYUVNEON(); + } +#endif // WEBP_HAVE_NEON + + assert(SharpYUVUpdateY != NULL); + assert(SharpYUVUpdateRGB != NULL); + assert(SharpYUVFilterRow != NULL); +} diff --git a/sharpyuv/sharpyuv_dsp.h b/sharpyuv/sharpyuv_dsp.h new file mode 100644 index 00000000..d4905f8e --- /dev/null +++ b/sharpyuv/sharpyuv_dsp.h @@ -0,0 +1,28 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Speed-critical functions for Sharp YUV. + +#ifndef WEBP_SHARPYUV_DSP_H_ +#define WEBP_SHARPYUV_DSP_H_ + +#include + +#include "src/dsp/cpu.h" + +extern uint64_t (*SharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref, + uint16_t* dst, int len); +extern void (*SharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref, + int16_t* dst, int len); +extern void (*SharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out); + +void InitSharpYuv(void); + +#endif // WEBP_SHARPYUV_DSP_H_ diff --git a/sharpyuv/sharpyuv_neon.c b/sharpyuv/sharpyuv_neon.c new file mode 100644 index 00000000..8528c951 --- /dev/null +++ b/sharpyuv/sharpyuv_neon.c @@ -0,0 +1,133 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Speed-critical functions for Sharp YUV. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "sharpyuv/sharpyuv_dsp.h" + +#if defined(WEBP_USE_NEON) +#include +#include +#include +#endif + +extern void InitSharpYUVNEON(void); + +#if defined(WEBP_USE_NEON) + +#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic +static uint16_t clip_y_NEON(int v) { + return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +} + +static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, + uint16_t* dst, int len) { + int i; + const int16x8_t zero = vdupq_n_s16(0); + const int16x8_t max = vdupq_n_s16(MAX_Y); + uint64x2_t sum = vdupq_n_u64(0); + uint64_t diff; + + for (i = 0; i + 8 <= len; i += 8) { + const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); + const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); + const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); + const int16x8_t D = vsubq_s16(A, B); // diff_y + const int16x8_t F = vaddq_s16(C, D); // new_y + const uint16x8_t H = + vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); + const int16x8_t I = vabsq_s16(D); // abs(diff_y) + vst1q_u16(dst + i, H); + sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); + } + diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); + for (; i < len; ++i) { + const int diff_y = ref[i] - src[i]; + const int new_y = (int)(dst[i]) + diff_y; + dst[i] = clip_y_NEON(new_y); + diff += (uint64_t)(abs(diff_y)); + } + return diff; +} + +static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src, + int16_t* dst, int len) { + int i; + for (i = 0; i + 8 <= len; i += 8) { + const int16x8_t A = vld1q_s16(ref + i); + const int16x8_t B = vld1q_s16(src + i); + const int16x8_t C = vld1q_s16(dst + i); + const int16x8_t D = vsubq_s16(A, B); // diff_uv + const int16x8_t E = vaddq_s16(C, D); // new_uv + vst1q_s16(dst + i, E); + } + for (; i < len; ++i) { + const int diff_uv = ref[i] - src[i]; + dst[i] += diff_uv; + } +} + +static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out) { + int i; + const int16x8_t max = vdupq_n_s16(MAX_Y); + const int16x8_t zero = vdupq_n_s16(0); + for (i = 0; i + 8 <= len; i += 8) { + const int16x8_t a0 = vld1q_s16(A + i + 0); + const int16x8_t a1 = vld1q_s16(A + i + 1); + const int16x8_t b0 = vld1q_s16(B + i + 0); + const int16x8_t b1 = vld1q_s16(B + i + 1); + const int16x8_t a0b1 = vaddq_s16(a0, b1); + const int16x8_t a1b0 = vaddq_s16(a1, b0); + const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1 + const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1) + const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0) + const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); + const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); + const int16x8_t d0 = vaddq_s16(c1, a0); + const int16x8_t d1 = vaddq_s16(c0, a1); + const int16x8_t e0 = vrshrq_n_s16(d0, 1); + const int16x8_t e1 = vrshrq_n_s16(d1, 1); + const int16x8x2_t f = vzipq_s16(e0, e1); + const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); + const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); + const int16x8_t h0 = vaddq_s16(g0, f.val[0]); + const int16x8_t h1 = vaddq_s16(g1, f.val[1]); + const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero); + const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero); + vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0)); + vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1)); + } + for (; i < len; ++i) { + const int a0b1 = A[i + 0] + B[i + 1]; + const int a1b0 = A[i + 1] + B[i + 0]; + const int a0a1b0b1 = a0b1 + a1b0 + 8; + const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; + const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; + out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0); + out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1); + } +} +#undef MAX_Y + +//------------------------------------------------------------------------------ + +WEBP_TSAN_IGNORE_FUNCTION void InitSharpYUVNEON(void) { + SharpYUVUpdateY = SharpYUVUpdateY_NEON; + SharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON; + SharpYUVFilterRow = SharpYUVFilterRow_NEON; +} + +#else // !WEBP_USE_NEON + +void InitSharpYUVNEON(void) {} + +#endif // WEBP_USE_NEON diff --git a/sharpyuv/sharpyuv_sse2.c b/sharpyuv/sharpyuv_sse2.c new file mode 100644 index 00000000..719868e6 --- /dev/null +++ b/sharpyuv/sharpyuv_sse2.c @@ -0,0 +1,143 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Speed-critical functions for Sharp YUV. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "sharpyuv/sharpyuv_dsp.h" + +#if defined(WEBP_USE_SSE2) +#include +#include +#endif + +extern void InitSharpYUVSSE2(void); + +#if defined(WEBP_USE_SSE2) + +#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic +static uint16_t clip_y(int v) { + return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +} + +static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src, + uint16_t* dst, int len) { + uint64_t diff = 0; + uint32_t tmp[4]; + int i; + const __m128i zero = _mm_setzero_si128(); + const __m128i max = _mm_set1_epi16(MAX_Y); + const __m128i one = _mm_set1_epi16(1); + __m128i sum = zero; + + for (i = 0; i + 8 <= len; i += 8) { + const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); + const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); + const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); + const __m128i D = _mm_sub_epi16(A, B); // diff_y + const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0) + const __m128i F = _mm_add_epi16(C, D); // new_y + const __m128i G = _mm_or_si128(E, one); // -1 or 1 + const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero); + const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...)) + _mm_storeu_si128((__m128i*)(dst + i), H); + sum = _mm_add_epi32(sum, I); + } + _mm_storeu_si128((__m128i*)tmp, sum); + diff = tmp[3] + tmp[2] + tmp[1] + tmp[0]; + for (; i < len; ++i) { + const int diff_y = ref[i] - src[i]; + const int new_y = (int)dst[i] + diff_y; + dst[i] = clip_y(new_y); + diff += (uint64_t)abs(diff_y); + } + return diff; +} + +static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src, + int16_t* dst, int len) { + int i = 0; + for (i = 0; i + 8 <= len; i += 8) { + const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); + const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); + const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); + const __m128i D = _mm_sub_epi16(A, B); // diff_uv + const __m128i E = _mm_add_epi16(C, D); // new_uv + _mm_storeu_si128((__m128i*)(dst + i), E); + } + for (; i < len; ++i) { + const int diff_uv = ref[i] - src[i]; + dst[i] += diff_uv; + } +} + +static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out) { + int i; + const __m128i kCst8 = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(MAX_Y); + const __m128i zero = _mm_setzero_si128(); + for (i = 0; i + 8 <= len; i += 8) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0)); + const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1)); + const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0)); + const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1)); + const __m128i a0b1 = _mm_add_epi16(a0, b1); + const __m128i a1b0 = _mm_add_epi16(a1, b0); + const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1 + const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8); + const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1) + const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0) + const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3); + const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3); + const __m128i d0 = _mm_add_epi16(c1, a0); + const __m128i d1 = _mm_add_epi16(c0, a1); + const __m128i e0 = _mm_srai_epi16(d0, 1); + const __m128i e1 = _mm_srai_epi16(d1, 1); + const __m128i f0 = _mm_unpacklo_epi16(e0, e1); + const __m128i f1 = _mm_unpackhi_epi16(e0, e1); + const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0)); + const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8)); + const __m128i h0 = _mm_add_epi16(g0, f0); + const __m128i h1 = _mm_add_epi16(g1, f1); + const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero); + const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero); + _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0); + _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1); + } + for (; i < len; ++i) { + // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 = + // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4 + // We reuse the common sub-expressions. + const int a0b1 = A[i + 0] + B[i + 1]; + const int a1b0 = A[i + 1] + B[i + 0]; + const int a0a1b0b1 = a0b1 + a1b0 + 8; + const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; + const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; + out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); + out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); + } +} +#undef MAX_Y + +//------------------------------------------------------------------------------ + +extern void InitSharpYUVSSE2(void); + +WEBP_TSAN_IGNORE_FUNCTION void InitSharpYUVSSE2(void) { + SharpYUVUpdateY = SharpYUVUpdateY_SSE2; + SharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2; + SharpYUVFilterRow = SharpYUVFilterRow_SSE2; +} +#else // !WEBP_USE_SSE2 + +void InitSharpYUVSSE2(void) {} + +#endif // WEBP_USE_SSE2 diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am index 766bb0a7..7db4ef0f 100644 --- a/src/dsp/Makefile.am +++ b/src/dsp/Makefile.am @@ -24,6 +24,7 @@ commondir = $(includedir)/webp COMMON_SOURCES = COMMON_SOURCES += alpha_processing.c COMMON_SOURCES += cpu.c +COMMON_SOURCES += cpu.h COMMON_SOURCES += dec.c COMMON_SOURCES += dec_clip_tables.c COMMON_SOURCES += dsp.h diff --git a/src/dsp/cpu.h b/src/dsp/cpu.h new file mode 100644 index 00000000..40711b2f --- /dev/null +++ b/src/dsp/cpu.h @@ -0,0 +1,239 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// CPU detection functions and macros. +// +// Author: Skal (pascal.massimino@gmail.com) + +#ifndef WEBP_DSP_CPU_H_ +#define WEBP_DSP_CPU_H_ + +#ifdef HAVE_CONFIG_H +#include "src/webp/config.h" +#endif + +#include "src/webp/types.h" + +#if defined(__GNUC__) +#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__) +#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min))) +#else +#define LOCAL_GCC_VERSION 0 +#define LOCAL_GCC_PREREQ(maj, min) 0 +#endif + +#if defined(__clang__) +#define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__) +#define LOCAL_CLANG_PREREQ(maj, min) \ + (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min))) +#else +#define LOCAL_CLANG_VERSION 0 +#define LOCAL_CLANG_PREREQ(maj, min) 0 +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#if !defined(HAVE_CONFIG_H) +#if defined(_MSC_VER) && _MSC_VER > 1310 && \ + (defined(_M_X64) || defined(_M_IX86)) +#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets +#endif + +#if defined(_MSC_VER) && _MSC_VER >= 1500 && \ + (defined(_M_X64) || defined(_M_IX86)) +#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets +#endif +#endif + +// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp +// files without intrinsics, allowing the corresponding Init() to be called. +// Files containing intrinsics will need to be built targeting the instruction +// set so should succeed on one of the earlier tests. +#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \ + (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2)) +#define WEBP_USE_SSE2 +#endif + +#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2) +#define WEBP_HAVE_SSE2 +#endif + +#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \ + (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41)) +#define WEBP_USE_SSE41 +#endif + +#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41) +#define WEBP_HAVE_SSE41 +#endif + +#undef WEBP_MSC_SSE41 +#undef WEBP_MSC_SSE2 + +// The intrinsics currently cause compiler errors with arm-nacl-gcc and the +// inline assembly would need to be modified for use with Native Client. +#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \ + (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \ + !defined(__native_client__) +#define WEBP_USE_NEON +#endif + +#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \ + defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H) +#define WEBP_ANDROID_NEON // Android targets that may have NEON +#define WEBP_USE_NEON +#endif + +// Note: ARM64 is supported in Visual Studio 2017, but requires the direct +// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in +// arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with +// vtbl4_u8(); a fix was made in 16.6. +#if defined(_MSC_VER) && ((_MSC_VER >= 1700 && defined(_M_ARM)) || \ + (_MSC_VER >= 1926 && defined(_M_ARM64))) +#define WEBP_USE_NEON +#define WEBP_USE_INTRINSICS +#endif + +#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON) +#define WEBP_HAVE_NEON +#endif + +#if defined(__mips__) && !defined(__mips64) && defined(__mips_isa_rev) && \ + (__mips_isa_rev >= 1) && (__mips_isa_rev < 6) +#define WEBP_USE_MIPS32 +#if (__mips_isa_rev >= 2) +#define WEBP_USE_MIPS32_R2 +#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2) +#define WEBP_USE_MIPS_DSP_R2 +#endif +#endif +#endif + +#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5) +#define WEBP_USE_MSA +#endif + +#ifndef WEBP_DSP_OMIT_C_CODE +#define WEBP_DSP_OMIT_C_CODE 1 +#endif + +#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE +#define WEBP_NEON_OMIT_C_CODE 1 +#else +#define WEBP_NEON_OMIT_C_CODE 0 +#endif + +#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || \ + defined(__aarch64__)) +#define WEBP_NEON_WORK_AROUND_GCC 1 +#else +#define WEBP_NEON_WORK_AROUND_GCC 0 +#endif + +// This macro prevents thread_sanitizer from reporting known concurrent writes. +#define WEBP_TSAN_IGNORE_FUNCTION +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#undef WEBP_TSAN_IGNORE_FUNCTION +#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread)) +#endif +#endif + +#if defined(WEBP_USE_THREAD) && !defined(_WIN32) +#include // NOLINT + +#define WEBP_DSP_INIT(func) \ + do { \ + static volatile VP8CPUInfo func##_last_cpuinfo_used = \ + (VP8CPUInfo)&func##_last_cpuinfo_used; \ + static pthread_mutex_t func##_lock = PTHREAD_MUTEX_INITIALIZER; \ + if (pthread_mutex_lock(&func##_lock)) break; \ + if (func##_last_cpuinfo_used != VP8GetCPUInfo) func(); \ + func##_last_cpuinfo_used = VP8GetCPUInfo; \ + (void)pthread_mutex_unlock(&func##_lock); \ + } while (0) +#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32)) +#define WEBP_DSP_INIT(func) \ + do { \ + static volatile VP8CPUInfo func##_last_cpuinfo_used = \ + (VP8CPUInfo)&func##_last_cpuinfo_used; \ + if (func##_last_cpuinfo_used == VP8GetCPUInfo) break; \ + func(); \ + func##_last_cpuinfo_used = VP8GetCPUInfo; \ + } while (0) +#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32) + +// Defines an Init + helper function that control multiple initialization of +// function pointers / tables. +/* Usage: + WEBP_DSP_INIT_FUNC(InitFunc) { + ...function body + } +*/ +#define WEBP_DSP_INIT_FUNC(name) \ + static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void); \ + WEBP_TSAN_IGNORE_FUNCTION void name(void) { WEBP_DSP_INIT(name##_body); } \ + static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void) + +#define WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#if defined(__clang__) && defined(__has_attribute) +#if __has_attribute(no_sanitize) +// This macro prevents the undefined behavior sanitizer from reporting +// failures. This is only meant to silence unaligned loads on platforms that +// are known to support them. +#undef WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNDEF __attribute__((no_sanitize("undefined"))) + +// This macro prevents the undefined behavior sanitizer from reporting +// failures related to unsigned integer overflows. This is only meant to +// silence cases where this well defined behavior is expected. +#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#endif + +// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'. +// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning. +#if !defined(WEBP_OFFSET_PTR) +#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off))) +#endif + +// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility) +#if !defined(WEBP_SWAP_16BIT_CSP) +#define WEBP_SWAP_16BIT_CSP 0 +#endif + +// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) +#if !defined(WORDS_BIGENDIAN) && \ + (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ + (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) +#define WORDS_BIGENDIAN +#endif + +typedef enum { + kSSE2, + kSSE3, + kSlowSSSE3, // special feature for slow SSSE3 architectures + kSSE4_1, + kAVX, + kAVX2, + kNEON, + kMIPS32, + kMIPSdspR2, + kMSA +} CPUFeature; +// returns true if the CPU supports the feature. +typedef int (*VP8CPUInfo)(CPUFeature feature); +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; + +#endif // WEBP_DSP_CPU_H_ diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 73a66be2..05a3a35e 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -18,6 +18,7 @@ #include "src/webp/config.h" #endif +#include "src/dsp/cpu.h" #include "src/webp/types.h" #ifdef __cplusplus @@ -43,226 +44,6 @@ extern "C" { #define WEBP_RESTRICT #endif -//------------------------------------------------------------------------------ -// CPU detection - -#if defined(__GNUC__) -# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__) -# define LOCAL_GCC_PREREQ(maj, min) \ - (LOCAL_GCC_VERSION >= (((maj) << 8) | (min))) -#else -# define LOCAL_GCC_VERSION 0 -# define LOCAL_GCC_PREREQ(maj, min) 0 -#endif - -#if defined(__clang__) -# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__) -# define LOCAL_CLANG_PREREQ(maj, min) \ - (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min))) -#else -# define LOCAL_CLANG_VERSION 0 -# define LOCAL_CLANG_PREREQ(maj, min) 0 -#endif - -#ifndef __has_builtin -# define __has_builtin(x) 0 -#endif - -#if !defined(HAVE_CONFIG_H) -#if defined(_MSC_VER) && _MSC_VER > 1310 && \ - (defined(_M_X64) || defined(_M_IX86)) -#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets -#endif - -#if defined(_MSC_VER) && _MSC_VER >= 1500 && \ - (defined(_M_X64) || defined(_M_IX86)) -#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets -#endif -#endif - -// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp -// files without intrinsics, allowing the corresponding Init() to be called. -// Files containing intrinsics will need to be built targeting the instruction -// set so should succeed on one of the earlier tests. -#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \ - (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2)) -#define WEBP_USE_SSE2 -#endif - -#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2) -#define WEBP_HAVE_SSE2 -#endif - -#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \ - (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41)) -#define WEBP_USE_SSE41 -#endif - -#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41) -#define WEBP_HAVE_SSE41 -#endif - -#undef WEBP_MSC_SSE41 -#undef WEBP_MSC_SSE2 - -// The intrinsics currently cause compiler errors with arm-nacl-gcc and the -// inline assembly would need to be modified for use with Native Client. -#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \ - (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \ - !defined(__native_client__) -#define WEBP_USE_NEON -#endif - -#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \ - defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H) -#define WEBP_ANDROID_NEON // Android targets that may have NEON -#define WEBP_USE_NEON -#endif - -// Note: ARM64 is supported in Visual Studio 2017, but requires the direct -// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in -// arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with -// vtbl4_u8(); a fix was made in 16.6. -#if defined(_MSC_VER) && \ - ((_MSC_VER >= 1700 && defined(_M_ARM)) || \ - (_MSC_VER >= 1926 && defined(_M_ARM64))) -#define WEBP_USE_NEON -#define WEBP_USE_INTRINSICS -#endif - -#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON) -#define WEBP_HAVE_NEON -#endif - -#if defined(__mips__) && !defined(__mips64) && \ - defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6) -#define WEBP_USE_MIPS32 -#if (__mips_isa_rev >= 2) -#define WEBP_USE_MIPS32_R2 -#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2) -#define WEBP_USE_MIPS_DSP_R2 -#endif -#endif -#endif - -#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5) -#define WEBP_USE_MSA -#endif - -#ifndef WEBP_DSP_OMIT_C_CODE -#define WEBP_DSP_OMIT_C_CODE 1 -#endif - -#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE -#define WEBP_NEON_OMIT_C_CODE 1 -#else -#define WEBP_NEON_OMIT_C_CODE 0 -#endif - -#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) -#define WEBP_NEON_WORK_AROUND_GCC 1 -#else -#define WEBP_NEON_WORK_AROUND_GCC 0 -#endif - -// This macro prevents thread_sanitizer from reporting known concurrent writes. -#define WEBP_TSAN_IGNORE_FUNCTION -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#undef WEBP_TSAN_IGNORE_FUNCTION -#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread)) -#endif -#endif - -#if defined(WEBP_USE_THREAD) && !defined(_WIN32) -#include // NOLINT - -#define WEBP_DSP_INIT(func) do { \ - static volatile VP8CPUInfo func ## _last_cpuinfo_used = \ - (VP8CPUInfo)&func ## _last_cpuinfo_used; \ - static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \ - if (pthread_mutex_lock(&func ## _lock)) break; \ - if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func(); \ - func ## _last_cpuinfo_used = VP8GetCPUInfo; \ - (void)pthread_mutex_unlock(&func ## _lock); \ -} while (0) -#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32)) -#define WEBP_DSP_INIT(func) do { \ - static volatile VP8CPUInfo func ## _last_cpuinfo_used = \ - (VP8CPUInfo)&func ## _last_cpuinfo_used; \ - if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break; \ - func(); \ - func ## _last_cpuinfo_used = VP8GetCPUInfo; \ -} while (0) -#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32) - -// Defines an Init + helper function that control multiple initialization of -// function pointers / tables. -/* Usage: - WEBP_DSP_INIT_FUNC(InitFunc) { - ...function body - } -*/ -#define WEBP_DSP_INIT_FUNC(name) \ - static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \ - WEBP_TSAN_IGNORE_FUNCTION void name(void) { \ - WEBP_DSP_INIT(name ## _body); \ - } \ - static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void) - -#define WEBP_UBSAN_IGNORE_UNDEF -#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW -#if defined(__clang__) && defined(__has_attribute) -#if __has_attribute(no_sanitize) -// This macro prevents the undefined behavior sanitizer from reporting -// failures. This is only meant to silence unaligned loads on platforms that -// are known to support them. -#undef WEBP_UBSAN_IGNORE_UNDEF -#define WEBP_UBSAN_IGNORE_UNDEF \ - __attribute__((no_sanitize("undefined"))) - -// This macro prevents the undefined behavior sanitizer from reporting -// failures related to unsigned integer overflows. This is only meant to -// silence cases where this well defined behavior is expected. -#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW -#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \ - __attribute__((no_sanitize("unsigned-integer-overflow"))) -#endif -#endif - -// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'. -// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning. -#if !defined(WEBP_OFFSET_PTR) -#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off))) -#endif - -// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility) -#if !defined(WEBP_SWAP_16BIT_CSP) -#define WEBP_SWAP_16BIT_CSP 0 -#endif - -// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) -#if !defined(WORDS_BIGENDIAN) && \ - (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ - (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) -#define WORDS_BIGENDIAN -#endif - -typedef enum { - kSSE2, - kSSE3, - kSlowSSSE3, // special feature for slow SSSE3 architectures - kSSE4_1, - kAVX, - kAVX2, - kNEON, - kMIPS32, - kMIPSdspR2, - kMSA -} CPUFeature; -// returns true if the CPU supports the feature. -typedef int (*VP8CPUInfo)(CPUFeature feature); -WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; //------------------------------------------------------------------------------ // Init stub generator diff --git a/src/utils/Makefile.am b/src/utils/Makefile.am index fbb0fe73..a4bff8b9 100644 --- a/src/utils/Makefile.am +++ b/src/utils/Makefile.am @@ -9,6 +9,7 @@ common_HEADERS = ../webp/types.h commondir = $(includedir)/webp noinst_HEADERS = +noinst_HEADERS += ../dsp/cpu.h noinst_HEADERS += ../dsp/dsp.h noinst_HEADERS += ../webp/decode.h noinst_HEADERS += ../webp/encode.h