Merge "Make libwebp depend on libsharpyuv." into main

This commit is contained in:
Maryla Ustarroz-Calonge 2022-03-07 09:34:37 +00:00 committed by Gerrit Code Review
commit d0d2292e1a
13 changed files with 51 additions and 717 deletions

View File

@ -33,6 +33,12 @@ else
NEON := c
endif
sharpyuv_srcs := \
sharpyuv/sharpyuv.c \
sharpyuv/sharpyuv_dsp.c \
sharpyuv/sharpyuv_neon.$(NEON) \
sharpyuv/sharpyuv_sse2.c \
dec_srcs := \
src/dec/alpha_dec.c \
src/dec/buffer_dec.c \
@ -204,6 +210,7 @@ endif # ENABLE_SHARED=1
include $(CLEAR_VARS)
LOCAL_SRC_FILES := \
$(sharpyuv_srcs) \
$(dsp_enc_srcs) \
$(enc_srcs) \
$(utils_enc_srcs) \

View File

@ -31,7 +31,6 @@ option(WEBP_BUILD_VWEBP "Build the vwebp viewer tool." ON)
option(WEBP_BUILD_WEBPINFO "Build the webpinfo command line tool." ON)
option(WEBP_BUILD_LIBWEBPMUX "Build the libwebpmux library." ON)
option(WEBP_BUILD_WEBPMUX "Build the webpmux command line tool." ON)
option(WEBP_BUILD_SHARPYUV "Build the sharpyuv library." OFF)
option(WEBP_BUILD_EXTRAS "Build extras." ON)
option(WEBP_BUILD_WEBP_JS "Emscripten build of webp.js." OFF)
option(WEBP_USE_THREAD "Enable threading support" ON)
@ -212,7 +211,6 @@ function(libwebp_add_stub_file TARGET)
target_sources(${TARGET} PRIVATE ${stub_source_file})
endfunction()
if(WEBP_BUILD_SHARPYUV)
parse_makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/sharpyuv "WEBP_SHARPYUV_SRCS"
"")
add_library(sharpyuv OBJECT ${WEBP_SHARPYUV_SRCS})
@ -223,7 +221,6 @@ if(WEBP_BUILD_SHARPYUV)
sharpyuv
PROPERTIES PUBLIC_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/sharpyuv/sharpyuv.h;\
${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h")
endif()
if(MSVC)
# avoid security warnings for e.g., fopen() used in the examples.
@ -289,6 +286,7 @@ target_include_directories(webputils
PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
${CMAKE_CURRENT_SOURCE_DIR})
add_library(webp
$<TARGET_OBJECTS:sharpyuv>
$<TARGET_OBJECTS:webpdecode>
$<TARGET_OBJECTS:webpdsp>
$<TARGET_OBJECTS:webpencode>
@ -311,7 +309,8 @@ ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h")
# Make sure the OBJECT libraries are built with position independent code (it is
# not ON by default).
set_target_properties(webpdecode
set_target_properties(sharpyuv
webpdecode
webpdspdecode
webputilsdecode
webpencode

View File

@ -1,11 +1,7 @@
ACLOCAL_AMFLAGS = -I m4
SUBDIRS = src imageio man
SUBDIRS = sharpyuv src imageio man
EXTRA_DIST = COPYING autogen.sh
if BUILD_SHARPYUV
SUBDIRS += sharpyuv
endif
if BUILD_EXTRAS
SUBDIRS += extras
endif

View File

@ -81,6 +81,7 @@ OUTPUT_DIRS = $(DIRBIN) $(DIRINC) $(DIRLIB) \
$(DIROBJ)\extras \
$(DIROBJ)\imageio \
$(DIROBJ)\mux \
$(DIROBJ)\sharpyuv \
$(DIROBJ)\utils \
# Target configuration
@ -173,6 +174,12 @@ CFLAGS = $(CFLAGS) /D_UNICODE /DUNICODE
# A config was provided, so the library can be built.
#
SHARPYUV_OBJS = \
$(DIROBJ)\sharpyuv\sharpyuv.obj \
$(DIROBJ)\sharpyuv\sharpyuv_dsp.obj \
$(DIROBJ)\sharpyuv\sharpyuv_neon.obj \
$(DIROBJ)\sharpyuv\sharpyuv_sse2.obj \
DEC_OBJS = \
$(DIROBJ)\dec\alpha_dec.obj \
$(DIROBJ)\dec\buffer_dec.obj \
@ -334,8 +341,8 @@ UTILS_ENC_OBJS = \
$(DIROBJ)\utils\quant_levels_utils.obj \
LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
$(UTILS_ENC_OBJS) $(DLL_OBJS)
LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(SHARPYUV_OBJS) $(ENC_OBJS) \
$(DSP_ENC_OBJS) $(UTILS_ENC_OBJS) $(DLL_OBJS)
LIBWEBPMUX_OBJS = $(MUX_OBJS) $(LIBWEBPMUX_OBJS)
LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS) $(LIBWEBPDEMUX_OBJS)
@ -481,6 +488,8 @@ $(DIROBJ)\examples\gifdec.obj: examples\gifdec.c
$(CC) $(CFLAGS) /Fd$(DIROBJ)\extras\ /Fo$(DIROBJ)\extras\ $<
{imageio}.c{$(DIROBJ)\imageio}.obj::
$(CC) $(CFLAGS) /Fd$(DIROBJ)\imageio\ /Fo$(DIROBJ)\imageio\ $<
{sharpyuv}.c{$(DIROBJ)\sharpyuv}.obj::
$(CC) $(CFLAGS) /Fd$(DIROBJ)\sharpyuv\ /Fo$(DIROBJ)\sharpyuv\ $<
{src\dec}.c{$(DIROBJ)\dec}.obj::
$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dec\ $<
{src\demux}.c{$(DIROBJ)\demux}.obj::

View File

@ -105,6 +105,11 @@ model {
sources {
c {
source {
srcDir "sharpyuv"
include "sharpyuv.c"
include "sharpyuv_dsp.c"
include "sharpyuv_neon.c"
include "sharpyuv_sse2.c"
srcDir "src/dec"
include "alpha_dec.c"
include "buffer_dec.c"

View File

@ -67,14 +67,6 @@ AC_ARG_ENABLE([libwebpextras],
AC_MSG_RESULT(${enable_libwebpextras-no})
AM_CONDITIONAL([BUILD_EXTRAS], [test "$enable_libwebpextras" = "yes"])
dnl === Check whether libsharpyuv should be built
AC_MSG_CHECKING(whether libsharpyuv is to be built)
AC_ARG_ENABLE([libsharpyuv],
AS_HELP_STRING([--enable-libsharpyuv],
[Build libsharpyuv @<:@default=no@:>@]))
AC_MSG_RESULT(${enable_libsharpyuv-no})
AM_CONDITIONAL([BUILD_SHARPYUV], [test "$enable_libsharpyuv" = "yes"])
dnl === If --enable-asserts is not defined, define NDEBUG
AC_MSG_CHECKING(whether asserts are enabled)
@ -782,7 +774,6 @@ libwebpdecoder: ${enable_libwebpdecoder-no}
libwebpdemux: ${enable_libwebpdemux-no}
libwebpmux: ${enable_libwebpmux-no}
libwebpextras: ${enable_libwebpextras-no}
libsharpyuv: ${enable_libsharpyuv-no}
Tools:
cwebp : ${enable_libwebpdemux-no}

View File

@ -125,6 +125,12 @@ endif
ANIM_UTIL_OBJS = \
examples/anim_util.o \
SHARPYUV_OBJS = \
sharpyuv/sharpyuv.o \
sharpyuv/sharpyuv_dsp.o \
sharpyuv/sharpyuv_neon.o \
sharpyuv/sharpyuv_sse2.o \
DEC_OBJS = \
src/dec/alpha_dec.o \
src/dec/buffer_dec.o \
@ -282,8 +288,8 @@ EXTRA_OBJS = \
extras/quality_estimate.o \
LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
$(UTILS_ENC_OBJS)
LIBWEBP_OBJS = $(SHARPYUV_OBJS) $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) \
$(DSP_ENC_OBJS) $(UTILS_ENC_OBJS)
LIBWEBPMUX_OBJS = $(MUX_OBJS)
LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS)
LIBWEBPEXTRA_OBJS = $(EXTRA_OBJS)

View File

@ -332,15 +332,6 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
// utilities for accurate RGB->YUV conversion
extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
uint16_t* dst, int len);
extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
int16_t* dst, int len);
extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B,
int len,
const uint16_t* best_y, uint16_t* out);
// Must be called before using the above.
void WebPInitConvertARGBToYUV(void);

View File

@ -194,50 +194,6 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
//-----------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
uint64_t diff = 0;
int i;
for (i = 0; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)dst[i] + diff_y;
dst[i] = clip_y(new_y);
diff += (uint64_t)abs(diff_y);
}
return diff;
}
static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
for (i = 0; i < len; ++i, ++A, ++B) {
const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MAX_Y
//-----------------------------------------------------------------------------
void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
@ -247,18 +203,9 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store);
uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len);
void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
int16_t* dst, int len);
void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out);
extern void WebPInitConvertARGBToYUVSSE2(void);
extern void WebPInitConvertARGBToYUVSSE41(void);
extern void WebPInitConvertARGBToYUVNEON(void);
extern void WebPInitSharpYUVSSE2(void);
extern void WebPInitSharpYUVNEON(void);
WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
WebPConvertARGBToY = ConvertARGBToY_C;
@ -269,17 +216,10 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitConvertARGBToYUVSSE2();
WebPInitSharpYUVSSE2();
}
#endif // WEBP_HAVE_SSE2
#if defined(WEBP_HAVE_SSE41)
@ -293,7 +233,6 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitConvertARGBToYUVNEON();
WebPInitSharpYUVNEON();
}
#endif // WEBP_HAVE_NEON
@ -302,7 +241,4 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
assert(WebPConvertRGB24ToY != NULL);
assert(WebPConvertBGR24ToY != NULL);
assert(WebPConvertRGBA32ToUV != NULL);
assert(WebPSharpYUVUpdateY != NULL);
assert(WebPSharpYUVUpdateRGB != NULL);
assert(WebPSharpYUVFilterRow != NULL);
}

View File

@ -173,116 +173,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
}
//------------------------------------------------------------------------------
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y_NEON(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
int i;
const int16x8_t zero = vdupq_n_s16(0);
const int16x8_t max = vdupq_n_s16(MAX_Y);
uint64x2_t sum = vdupq_n_u64(0);
uint64_t diff;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
const int16x8_t D = vsubq_s16(A, B); // diff_y
const int16x8_t F = vaddq_s16(C, D); // new_y
const uint16x8_t H =
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
vst1q_u16(dst + i, H);
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
}
diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)(dst[i]) + diff_y;
dst[i] = clip_y_NEON(new_y);
diff += (uint64_t)(abs(diff_y));
}
return diff;
}
static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vld1q_s16(ref + i);
const int16x8_t B = vld1q_s16(src + i);
const int16x8_t C = vld1q_s16(dst + i);
const int16x8_t D = vsubq_s16(A, B); // diff_uv
const int16x8_t E = vaddq_s16(C, D); // new_uv
vst1q_s16(dst + i, E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
const int16x8_t max = vdupq_n_s16(MAX_Y);
const int16x8_t zero = vdupq_n_s16(0);
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t a0 = vld1q_s16(A + i + 0);
const int16x8_t a1 = vld1q_s16(A + i + 1);
const int16x8_t b0 = vld1q_s16(B + i + 0);
const int16x8_t b1 = vld1q_s16(B + i + 1);
const int16x8_t a0b1 = vaddq_s16(a0, b1);
const int16x8_t a1b0 = vaddq_s16(a1, b0);
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
const int16x8_t d0 = vaddq_s16(c1, a0);
const int16x8_t d1 = vaddq_s16(c0, a1);
const int16x8_t e0 = vrshrq_n_s16(d0, 1);
const int16x8_t e1 = vrshrq_n_s16(d1, 1);
const int16x8x2_t f = vzipq_s16(e0, e1);
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
}
for (; i < len; ++i) {
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
}
}
#undef MAX_Y
//------------------------------------------------------------------------------
extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
#endif // WEBP_USE_NEON

View File

@ -747,128 +747,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
}
//------------------------------------------------------------------------------
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
uint64_t diff = 0;
uint32_t tmp[4];
int i;
const __m128i zero = _mm_setzero_si128();
const __m128i max = _mm_set1_epi16(MAX_Y);
const __m128i one = _mm_set1_epi16(1);
__m128i sum = zero;
for (i = 0; i + 8 <= len; i += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_y
const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
const __m128i F = _mm_add_epi16(C, D); // new_y
const __m128i G = _mm_or_si128(E, one); // -1 or 1
const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
_mm_storeu_si128((__m128i*)(dst + i), H);
sum = _mm_add_epi32(sum, I);
}
_mm_storeu_si128((__m128i*)tmp, sum);
diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)dst[i] + diff_y;
dst[i] = clip_y(new_y);
diff += (uint64_t)abs(diff_y);
}
return diff;
}
static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i = 0;
for (i = 0; i + 8 <= len; i += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_uv
const __m128i E = _mm_add_epi16(C, D); // new_uv
_mm_storeu_si128((__m128i*)(dst + i), E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
const __m128i kCst8 = _mm_set1_epi16(8);
const __m128i max = _mm_set1_epi16(MAX_Y);
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 8 <= len; i += 8) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
const __m128i a0b1 = _mm_add_epi16(a0, b1);
const __m128i a1b0 = _mm_add_epi16(a1, b0);
const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
const __m128i d0 = _mm_add_epi16(c1, a0);
const __m128i d1 = _mm_add_epi16(c0, a1);
const __m128i e0 = _mm_srai_epi16(d0, 1);
const __m128i e1 = _mm_srai_epi16(d1, 1);
const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
const __m128i h0 = _mm_add_epi16(g0, f0);
const __m128i h1 = _mm_add_epi16(g1, f1);
const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
_mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
_mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
}
for (; i < len; ++i) {
// (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
// = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
// We reuse the common sub-expressions.
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
}
#undef MAX_Y
//------------------------------------------------------------------------------
extern void WebPInitSharpYUVSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
#endif // WEBP_USE_SSE2

View File

@ -37,6 +37,7 @@ libwebpencodeinclude_HEADERS += ../webp/types.h
noinst_HEADERS =
noinst_HEADERS += ../webp/format_constants.h
libwebpencode_la_LIBADD = ../../sharpyuv/libsharpyuv.la
libwebpencode_la_LDFLAGS = -lm
libwebpencode_la_CPPFLAGS = $(AM_CPPFLAGS)
libwebpencodeincludedir = $(includedir)/webp

View File

@ -15,6 +15,7 @@
#include <stdlib.h>
#include <math.h>
#include "sharpyuv/sharpyuv.h"
#include "src/enc/vp8i_enc.h"
#include "src/utils/random_utils.h"
#include "src/utils/utils.h"
@ -158,7 +159,6 @@ static int RGBToV(int r, int g, int b, VP8Random* const rg) {
//------------------------------------------------------------------------------
// Sharp RGB->YUV conversion
static const int kNumIterations = 4;
static const int kMinDimensionIterativeConversion = 4;
// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
@ -171,402 +171,23 @@ typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W
#define MAX_Y_T ((256 << SFIX) - 1)
#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
#if defined(USE_GAMMA_COMPRESSION)
// We use tables of different size and precision for the Rec709 / BT2020
// transfer function.
#define kGammaF (1./0.45)
static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
#define GAMMA_TO_LINEAR_BITS 14
static uint32_t kGammaToLinearTabS[MAX_Y_T + 1]; // size scales with Y_FIX
static volatile int kGammaTablesSOk = 0;
static void InitGammaTablesS(void);
WEBP_DSP_INIT_FUNC(InitGammaTablesS) {
assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values
if (!kGammaTablesSOk) {
int v;
const double norm = 1. / MAX_Y_T;
const double scale = 1. / kGammaTabSize;
const double a = 0.09929682680944;
const double thresh = 0.018053968510807;
const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
for (v = 0; v <= MAX_Y_T; ++v) {
const double g = norm * v;
double value;
if (g <= thresh * 4.5) {
value = g / 4.5;
} else {
const double a_rec = 1. / (1. + a);
value = pow(a_rec * (g + a), kGammaF);
}
kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
}
for (v = 0; v <= kGammaTabSize; ++v) {
const double g = scale * v;
double value;
if (g <= thresh) {
value = 4.5 * g;
} else {
value = (1. + a) * pow(g, 1. / kGammaF) - a;
}
// we already incorporate the 1/2 rounding constant here
kLinearToGammaTabS[v] =
(uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
}
// to prevent small rounding errors to cause read-overflow:
kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
kGammaTablesSOk = 1;
}
}
// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
static WEBP_INLINE uint32_t GammaToLinearS(int v) {
return kGammaToLinearTabS[v];
}
static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
// 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
const uint32_t v = value * kGammaTabSize;
const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
// fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part
// v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
// Final interpolation. Note that rounding is already included.
const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
return result;
}
#else
static void InitGammaTablesS(void) {}
static WEBP_INLINE uint32_t GammaToLinearS(int v) {
return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
}
static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
}
#endif // USE_GAMMA_COMPRESSION
//------------------------------------------------------------------------------
static uint8_t clip_8b(fixed_t v) {
return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
}
static fixed_y_t clip_y(int y) {
return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
}
//------------------------------------------------------------------------------
static int RGBToGray(int r, int g, int b) {
const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
return (luma >> YUV_FIX);
}
static uint32_t ScaleDown(int a, int b, int c, int d) {
const uint32_t A = GammaToLinearS(a);
const uint32_t B = GammaToLinearS(b);
const uint32_t C = GammaToLinearS(c);
const uint32_t D = GammaToLinearS(d);
return LinearToGammaS((A + B + C + D + 2) >> 2);
}
static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
int i;
for (i = 0; i < w; ++i) {
const uint32_t R = GammaToLinearS(src[0 * w + i]);
const uint32_t G = GammaToLinearS(src[1 * w + i]);
const uint32_t B = GammaToLinearS(src[2 * w + i]);
const uint32_t Y = RGBToGray(R, G, B);
dst[i] = (fixed_y_t)LinearToGammaS(Y);
}
}
static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
fixed_t* dst, int uv_w) {
int i;
for (i = 0; i < uv_w; ++i) {
const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
const int W = RGBToGray(r, g, b);
dst[0 * uv_w] = (fixed_t)(r - W);
dst[1 * uv_w] = (fixed_t)(g - W);
dst[2 * uv_w] = (fixed_t)(b - W);
dst += 1;
src1 += 2;
src2 += 2;
}
}
static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
int i;
for (i = 0; i < w; ++i) {
y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
}
}
//------------------------------------------------------------------------------
static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
const int v0 = (A * 3 + B + 2) >> 2;
return clip_y(v0 + W0);
}
//------------------------------------------------------------------------------
static WEBP_INLINE fixed_y_t UpLift(uint8_t a) { // 8bit -> SFIX
return ((fixed_y_t)a << SFIX) | SHALF;
}
static void ImportOneRow(const uint8_t* const r_ptr,
const uint8_t* const g_ptr,
const uint8_t* const b_ptr,
int step,
int pic_width,
fixed_y_t* const dst) {
int i;
const int w = (pic_width + 1) & ~1;
for (i = 0; i < pic_width; ++i) {
const int off = i * step;
dst[i + 0 * w] = UpLift(r_ptr[off]);
dst[i + 1 * w] = UpLift(g_ptr[off]);
dst[i + 2 * w] = UpLift(b_ptr[off]);
}
if (pic_width & 1) { // replicate rightmost pixel
dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
}
}
static void InterpolateTwoRows(const fixed_y_t* const best_y,
const fixed_t* prev_uv,
const fixed_t* cur_uv,
const fixed_t* next_uv,
int w,
fixed_y_t* out1,
fixed_y_t* out2) {
const int uv_w = w >> 1;
const int len = (w - 1) >> 1; // length to filter
int k = 3;
while (k-- > 0) { // process each R/G/B segments in turn
// special boundary case for i==0
out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
// special boundary case for i == w - 1 when w is even
if (!(w & 1)) {
out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
best_y[w - 1 + 0]);
out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
best_y[w - 1 + w]);
}
out1 += w;
out2 += w;
prev_uv += uv_w;
cur_uv += uv_w;
next_uv += uv_w;
}
}
static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
}
static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
const int u = -9719 * r - 19081 * g + 28800 * b + SROUNDER;
return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
}
static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
const int v = +28800 * r - 24116 * g - 4684 * b + SROUNDER;
return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
}
static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
WebPPicture* const picture) {
int i, j;
uint8_t* dst_y = picture->y;
uint8_t* dst_u = picture->u;
uint8_t* dst_v = picture->v;
const fixed_t* const best_uv_base = best_uv;
const int w = (picture->width + 1) & ~1;
const int h = (picture->height + 1) & ~1;
const int uv_w = w >> 1;
const int uv_h = h >> 1;
for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
for (i = 0; i < picture->width; ++i) {
const int off = (i >> 1);
const int W = best_y[i];
const int r = best_uv[off + 0 * uv_w] + W;
const int g = best_uv[off + 1 * uv_w] + W;
const int b = best_uv[off + 2 * uv_w] + W;
dst_y[i] = ConvertRGBToY(r, g, b);
}
best_y += w;
best_uv += (j & 1) * 3 * uv_w;
dst_y += picture->y_stride;
}
for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
for (i = 0; i < uv_w; ++i) {
const int off = i;
const int r = best_uv[off + 0 * uv_w];
const int g = best_uv[off + 1 * uv_w];
const int b = best_uv[off + 2 * uv_w];
dst_u[i] = ConvertRGBToU(r, g, b);
dst_v[i] = ConvertRGBToV(r, g, b);
}
best_uv += 3 * uv_w;
dst_u += picture->uv_stride;
dst_v += picture->uv_stride;
}
return 1;
}
//------------------------------------------------------------------------------
// Main function
#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
static int PreprocessARGB(const uint8_t* r_ptr,
const uint8_t* g_ptr,
const uint8_t* b_ptr,
int step, int rgb_stride,
WebPPicture* const picture) {
// we expand the right/bottom border if needed
const int w = (picture->width + 1) & ~1;
const int h = (picture->height + 1) & ~1;
const int uv_w = w >> 1;
const int uv_h = h >> 1;
uint64_t prev_diff_y_sum = ~0;
int j, iter;
// TODO(skal): allocate one big memory chunk. But for now, it's easier
// for valgrind debugging to have several chunks.
fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
fixed_y_t* best_y = best_y_base;
fixed_y_t* target_y = target_y_base;
fixed_t* best_uv = best_uv_base;
fixed_t* target_uv = target_uv_base;
const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
int ok;
if (best_y_base == NULL || best_uv_base == NULL ||
target_y_base == NULL || target_uv_base == NULL ||
best_rgb_y == NULL || best_rgb_uv == NULL ||
tmp_buffer == NULL) {
int ok = SharpArgbToYuv(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture->y,
picture->y_stride, picture->u, picture->uv_stride,
picture->v, picture->uv_stride, picture->width,
picture->height);
if (!ok) {
ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
goto End;
}
assert(picture->width >= kMinDimensionIterativeConversion);
assert(picture->height >= kMinDimensionIterativeConversion);
WebPInitConvertARGBToYUV();
// Import RGB samples to W/RGB representation.
for (j = 0; j < picture->height; j += 2) {
const int is_last_row = (j == picture->height - 1);
fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
// prepare two rows of input
ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
if (!is_last_row) {
ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
step, picture->width, src2);
} else {
memcpy(src2, src1, 3 * w * sizeof(*src2));
}
StoreGray(src1, best_y + 0, w);
StoreGray(src2, best_y + w, w);
UpdateW(src1, target_y, w);
UpdateW(src2, target_y + w, w);
UpdateChroma(src1, src2, target_uv, uv_w);
memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
best_y += 2 * w;
best_uv += 3 * uv_w;
target_y += 2 * w;
target_uv += 3 * uv_w;
r_ptr += 2 * rgb_stride;
g_ptr += 2 * rgb_stride;
b_ptr += 2 * rgb_stride;
}
// Iterate and resolve clipping conflicts.
for (iter = 0; iter < kNumIterations; ++iter) {
const fixed_t* cur_uv = best_uv_base;
const fixed_t* prev_uv = best_uv_base;
uint64_t diff_y_sum = 0;
best_y = best_y_base;
best_uv = best_uv_base;
target_y = target_y_base;
target_uv = target_uv_base;
for (j = 0; j < h; j += 2) {
fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
{
const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
prev_uv = cur_uv;
cur_uv = next_uv;
}
UpdateW(src1, best_rgb_y + 0 * w, w);
UpdateW(src2, best_rgb_y + 1 * w, w);
UpdateChroma(src1, src2, best_rgb_uv, uv_w);
// update two rows of Y and one row of RGB
diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
best_y += 2 * w;
best_uv += 3 * uv_w;
target_y += 2 * w;
target_uv += 3 * uv_w;
}
// test exit condition
if (iter > 0) {
if (diff_y_sum < diff_y_threshold) break;
if (diff_y_sum > prev_diff_y_sum) break;
}
prev_diff_y_sum = diff_y_sum;
}
// final reconstruction
ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
End:
WebPSafeFree(best_y_base);
WebPSafeFree(best_uv_base);
WebPSafeFree(target_y_base);
WebPSafeFree(target_uv_base);
WebPSafeFree(best_rgb_y);
WebPSafeFree(best_rgb_uv);
WebPSafeFree(tmp_buffer);
return ok;
}
#undef SAFE_ALLOC
//------------------------------------------------------------------------------
// "Fast" regular RGB->YUV
@ -874,7 +495,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
}
if (use_iterative_conversion) {
InitGammaTablesS();
if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
return 0;
}