diff --git a/Android.mk b/Android.mk
index 37a60b67..340e22c2 100644
--- a/Android.mk
+++ b/Android.mk
@@ -86,6 +86,7 @@ dsp_enc_srcs := \
     src/dsp/lossless_enc_mips_dsp_r2.c \
     src/dsp/lossless_enc_neon.$(NEON) \
     src/dsp/lossless_enc_sse2.c \
+    src/dsp/lossless_enc_sse41.c \
 
 enc_srcs := \
     src/enc/alpha.c \
diff --git a/Makefile.vc b/Makefile.vc
index d0ddb425..55ba3c23 100644
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -237,6 +237,7 @@ DSP_ENC_OBJS = \
     $(DIROBJ)\dsp\lossless_enc_mips_dsp_r2.obj \
     $(DIROBJ)\dsp\lossless_enc_neon.obj \
     $(DIROBJ)\dsp\lossless_enc_sse2.obj \
+    $(DIROBJ)\dsp\lossless_enc_sse41.obj \
 
 EX_ANIM_UTIL_OBJS = \
     $(DIROBJ)\examples\anim_util.obj \
diff --git a/makefile.unix b/makefile.unix
index 9d508e66..3dbd475e 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -177,6 +177,7 @@ DSP_ENC_OBJS = \
     src/dsp/lossless_enc_mips_dsp_r2.o \
     src/dsp/lossless_enc_neon.o \
     src/dsp/lossless_enc_sse2.o \
+    src/dsp/lossless_enc_sse41.o \
 
 ENC_OBJS = \
     src/enc/alpha.o \
diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am
index 5e8f9a22..0981336b 100644
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -86,6 +86,7 @@ libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la
 
 libwebpdsp_sse41_la_SOURCES =
 libwebpdsp_sse41_la_SOURCES += enc_sse41.c
+libwebpdsp_sse41_la_SOURCES += lossless_enc_sse41.c
 libwebpdsp_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)
 libwebpdsp_sse41_la_LIBADD = libwebpdspdecode_sse41.la
diff --git a/src/dsp/lossless_enc.c b/src/dsp/lossless_enc.c
index 23ac590d..d20ce1f8 100644
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
@@ -1221,6 +1221,7 @@ VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
 VP8LHistogramAddFunc VP8LHistogramAdd;
 
 extern void VP8LEncDspInitSSE2(void);
+extern void VP8LEncDspInitSSE41(void);
 extern void VP8LEncDspInitNEON(void);
 extern void VP8LEncDspInitMIPS32(void);
 extern void VP8LEncDspInitMIPSdspR2(void);
@@ -1256,6 +1257,13 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
 #if defined(WEBP_USE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8LEncDspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+      // Probed only when SSE2 is present, and run after the SSE2 init so that
+      // whatever it installs takes precedence.
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8LEncDspInitSSE41();
+      }
+#endif
     }
 #endif
 #if defined(WEBP_USE_NEON)
diff --git a/src/dsp/lossless_enc_sse41.c b/src/dsp/lossless_enc_sse41.c
new file mode 100644
index 00000000..dd8625a1
--- /dev/null
+++ b/src/dsp/lossless_enc_sse41.c
@@ -0,0 +1,101 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4.1 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <assert.h>
+#include <smmintrin.h>
+#include "./lossless.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// Subtracts, in place and modulo 256, the byte at offset 1 of each 32-bit
+// pixel (green, going by the names used here) from the bytes at offsets 0 and
+// 2 of that same pixel; bytes 1 and 3 are left untouched. Pixels are processed
+// four at a time and the tail is handed to the plain-C implementation.
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  int i;
+  // Per pixel: replicate byte 1 into byte lanes 0 and 2; the -1 selectors make
+  // _mm_shuffle_epi8 zero lanes 1 and 3.
+  const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
+                                           -1, 5, -1, 5, -1, 1, -1, 1);
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
+    const __m128i out = _mm_sub_epi8(in, in_0g0g);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+// Applies the color transform in place, two pixels per iteration. For every
+// pixel, the byte at offset 2 (red) is decreased by (g * green_to_red_) >> 5
+// and the byte at offset 0 (blue) by
+// ((g * green_to_blue_) >> 5) + ((r * red_to_blue_) >> 5), where g and r are
+// the pixel's original bytes at offsets 1 and 2. All operands are treated as
+// signed 8-bit values and the subtraction wraps modulo 256. A leftover pixel
+// is handed to the plain-C implementation.
+static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
+                                       uint32_t* argb_data, int num_pixels) {
+  // Shuffle constant to spread green and red to some *upper* byte locations.
+  const __m128i kCst_g0rg = _mm_set_epi8(5, -1, -1, -1, 6, -1, 5, -1,
+                                         1, -1, -1, -1, 2, -1, 1, -1);
+  // Shuffling constant to collect deltas from uint32 to uint8 locations.
+  const __m128i kCstShuffle = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+                                           -1, 12, -1, 8, -1, 4, -1, 0);
+  // Used to collect the two parts of the delta (horizontal add) with madd.
+  const __m128i kCstAdd = _mm_set1_epi16(1);
+  // sign-extended multiplying constants, pre-shifted by 5.
+#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
+  const __m128i mults = _mm_set_epi16(
+      CST(green_to_red_), 0, CST(red_to_blue_), CST(green_to_blue_),
+      CST(green_to_red_), 0, CST(red_to_blue_), CST(green_to_blue_));
+#undef CST
+
+  int i;
+  for (i = 0; i + 2 <= num_pixels; i += 2) {
+    const __m128i in = _mm_loadl_epi64((__m128i*)&argb_data[i]);  // argb
+    const __m128i A = _mm_shuffle_epi8(in, kCst_g0rg);     // g | 0 | r | g
+    const __m128i B = _mm_mulhi_epi16(A, mults);           // dr | 0 | db1 | db2
+    const __m128i C = _mm_madd_epi16(B, kCstAdd);          // dr | 0 | db | 0
+    const __m128i D = _mm_shuffle_epi8(C, kCstShuffle);    // 0 | dr | 0 | db
+    const __m128i out = _mm_sub_epi8(in, D);
+    _mm_storel_epi64((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE41(void);
+
+// Points VP8LSubtractGreenFromBlueAndRed and VP8LTransformColor at the SSE4.1
+// implementations defined in this file.
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LTransformColor = TransformColor;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41)
+
+#endif  // WEBP_USE_SSE41