diff --git a/Android.mk b/Android.mk index 6872060e..c123c657 100644 --- a/Android.mk +++ b/Android.mk @@ -48,6 +48,7 @@ dsp_dec_srcs := \ src/dsp/dec_mips_dsp_r2.c \ src/dsp/dec_neon.$(NEON) \ src/dsp/dec_sse2.c \ + src/dsp/dec_sse41.c \ src/dsp/filters.c \ src/dsp/filters_mips_dsp_r2.c \ src/dsp/filters_sse2.c \ diff --git a/Makefile.vc b/Makefile.vc index 74ccd64f..176de81d 100644 --- a/Makefile.vc +++ b/Makefile.vc @@ -195,6 +195,7 @@ DSP_DEC_OBJS = \ $(DIROBJ)\dsp\dec_mips_dsp_r2.obj \ $(DIROBJ)\dsp\dec_neon.obj \ $(DIROBJ)\dsp\dec_sse2.obj \ + $(DIROBJ)\dsp\dec_sse41.obj \ $(DIROBJ)\dsp\filters.obj \ $(DIROBJ)\dsp\filters_mips_dsp_r2.obj \ $(DIROBJ)\dsp\filters_sse2.obj \ diff --git a/configure.ac b/configure.ac index 58ad34f2..5ef4ac0b 100644 --- a/configure.ac +++ b/configure.ac @@ -97,6 +97,17 @@ AS_IF([test -n "$AVX2_FLAGS"], [ CFLAGS=$SAVED_CFLAGS]) AC_SUBST([AVX2_FLAGS]) +TEST_AND_ADD_CFLAGS([SSE41_FLAGS], [-msse4.1]) +AS_IF([test -n "$SSE41_FLAGS"], [ + SAVED_CFLAGS=$CFLAGS + CFLAGS="$CFLAGS $SSE41_FLAGS" + AC_CHECK_HEADER([smmintrin.h], + [AC_DEFINE(WEBP_HAVE_SSE41, [1], + [Set to 1 if SSE4.1 is supported])], + [SSE41_FLAGS=""]) + CFLAGS=$SAVED_CFLAGS]) +AC_SUBST([SSE41_FLAGS]) + TEST_AND_ADD_CFLAGS([SSE2_FLAGS], [-msse2]) AS_IF([test -n "$SSE2_FLAGS"], [ SAVED_CFLAGS=$CFLAGS diff --git a/makefile.unix b/makefile.unix index 277e5e4f..7011c20d 100644 --- a/makefile.unix +++ b/makefile.unix @@ -119,6 +119,7 @@ DSP_DEC_OBJS = \ src/dsp/dec_mips_dsp_r2.o \ src/dsp/dec_neon.o \ src/dsp/dec_sse2.o \ + src/dsp/dec_sse41.o \ src/dsp/filters.o \ src/dsp/filters_mips_dsp_r2.o \ src/dsp/filters_sse2.o \ diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am index 30e4be86..06f11a95 100644 --- a/src/dsp/Makefile.am +++ b/src/dsp/Makefile.am @@ -1,5 +1,6 @@ noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la +noinst_LTLIBRARIES += libwebpdspdecode_sse41.la if BUILD_LIBWEBPDECODER 
noinst_LTLIBRARIES += libwebpdspdecode.la @@ -54,6 +55,11 @@ libwebpdsp_avx2_la_SOURCES += enc_avx2.c libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS) +libwebpdspdecode_sse41_la_SOURCES = +libwebpdspdecode_sse41_la_SOURCES += dec_sse41.c +libwebpdspdecode_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) +libwebpdspdecode_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS) + libwebpdspdecode_sse2_la_SOURCES = libwebpdspdecode_sse2_la_SOURCES += alpha_processing_sse2.c libwebpdspdecode_sse2_la_SOURCES += dec_sse2.c @@ -81,12 +87,16 @@ noinst_HEADERS += ../webp/decode.h libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP) libwebpdsp_la_LDFLAGS = -lm -libwebpdsp_la_LIBADD = libwebpdsp_avx2.la libwebpdsp_sse2.la +libwebpdsp_la_LIBADD = +libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la +libwebpdsp_la_LIBADD += libwebpdspdecode_sse41.la if BUILD_LIBWEBPDECODER libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES) libwebpdspdecode_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) libwebpdspdecode_la_LDFLAGS = $(libwebpdsp_la_LDFLAGS) - libwebpdspdecode_la_LIBADD = libwebpdspdecode_sse2.la + libwebpdspdecode_la_LIBADD = + libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse2.la + libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse41.la endif diff --git a/src/dsp/dec.c b/src/dsp/dec.c index d42cdc0e..dd8578e5 100644 --- a/src/dsp/dec.c +++ b/src/dsp/dec.c @@ -674,6 +674,7 @@ VP8SimpleFilterFunc VP8SimpleVFilter16i; VP8SimpleFilterFunc VP8SimpleHFilter16i; extern void VP8DspInitSSE2(void); +extern void VP8DspInitSSE41(void); extern void VP8DspInitNEON(void); extern void VP8DspInitMIPS32(void); extern void VP8DspInitMIPSdspR2(void); @@ -738,6 +739,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { #if defined(WEBP_USE_SSE2) if (VP8GetCPUInfo(kSSE2)) { VP8DspInitSSE2(); +#if defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + VP8DspInitSSE41(); + } +#endif } #endif #if defined(WEBP_USE_NEON) diff --git 
a/src/dsp/dec_sse41.c b/src/dsp/dec_sse41.c new file mode 100644 index 00000000..4ae52fec --- /dev/null +++ b/src/dsp/dec_sse41.c @@ -0,0 +1,38 @@ +// Copyright 2015 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// SSE4 version of some decoding functions. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "./dsp.h" + +#if defined(WEBP_USE_SSE41) + +#include <smmintrin.h> +#include "../dec/vp8i.h" + +static void HE16(uint8_t* dst) { // horizontal + int j; + const __m128i kShuffle3 = _mm_set1_epi8(3); + for (j = 16; j > 0; --j) { + const __m128i in = _mm_cvtsi32_si128(*(int*)(dst - 4)); + const __m128i values = _mm_shuffle_epi8(in, kShuffle3); + _mm_storeu_si128((__m128i*)dst, values); + dst += BPS; + } +} +#endif // WEBP_USE_SSE41 + +extern void VP8DspInitSSE41(void); +WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) { +#if defined(WEBP_USE_SSE41) + VP8PredLuma16[3] = HE16; +#endif // WEBP_USE_SSE41 +} diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 8ed5b104..9f1c072f 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -52,6 +52,11 @@ extern "C" { #define WEBP_MSC_SSE2 // Visual C++ SSE2 targets #endif +#if defined(_MSC_VER) && _MSC_VER >= 1500 && \ + (defined(_M_X64) || defined(_M_IX86)) +#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets +#endif + // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp // files without intrinsics, allowing the corresponding Init() to be called. 
// Files containing intrinsics will need to be built targeting the instruction @@ -60,6 +65,10 @@ extern "C" { #define WEBP_USE_SSE2 #endif +#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41) +#define WEBP_USE_SSE41 +#endif + #if defined(__AVX2__) || defined(WEBP_HAVE_AVX2) #define WEBP_USE_AVX2 #endif