From bfc300c7ff6771238cc7e840e4901cae31196d77 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Fri, 19 Jun 2015 14:23:38 +0200 Subject: [PATCH] SSE4.1 implementation of some alpha-processing functions DispatchAlpha* functions are hard to speed up, compared to SSE2. ExtractAlpha sees a ~15% speed-up though. Change-Id: I8715c2defecbc832f469eed7e6ffd012146b52de --- Android.mk | 1 + Makefile.vc | 1 + makefile.unix | 1 + src/dsp/Makefile.am | 1 + src/dsp/alpha_processing.c | 6 +++ src/dsp/alpha_processing_sse41.c | 92 ++++++++++++++++++++++++++++++++ 6 files changed, 102 insertions(+) create mode 100644 src/dsp/alpha_processing_sse41.c diff --git a/Android.mk b/Android.mk index 0e8b4a14..37a60b67 100644 --- a/Android.mk +++ b/Android.mk @@ -38,6 +38,7 @@ dsp_dec_srcs := \ src/dsp/alpha_processing.c \ src/dsp/alpha_processing_mips_dsp_r2.c \ src/dsp/alpha_processing_sse2.c \ + src/dsp/alpha_processing_sse41.c \ src/dsp/argb.c \ src/dsp/argb_mips_dsp_r2.c \ src/dsp/argb_sse2.c \ diff --git a/Makefile.vc b/Makefile.vc index d194b510..d0ddb425 100644 --- a/Makefile.vc +++ b/Makefile.vc @@ -189,6 +189,7 @@ DSP_DEC_OBJS = \ $(DIROBJ)\dsp\alpha_processing.obj \ $(DIROBJ)\dsp\alpha_processing_mips_dsp_r2.obj \ $(DIROBJ)\dsp\alpha_processing_sse2.obj \ + $(DIROBJ)\dsp\alpha_processing_sse41.obj \ $(DIROBJ)\dsp\cpu.obj \ $(DIROBJ)\dsp\dec.obj \ $(DIROBJ)\dsp\dec_clip_tables.obj \ diff --git a/makefile.unix b/makefile.unix index 37013f78..9d508e66 100644 --- a/makefile.unix +++ b/makefile.unix @@ -129,6 +129,7 @@ DSP_DEC_OBJS = \ src/dsp/alpha_processing.o \ src/dsp/alpha_processing_mips_dsp_r2.o \ src/dsp/alpha_processing_sse2.o \ + src/dsp/alpha_processing_sse41.o \ src/dsp/cpu.o \ src/dsp/dec.o \ src/dsp/dec_clip_tables.o \ diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am index c03a8d3b..5e8f9a22 100644 --- a/src/dsp/Makefile.am +++ b/src/dsp/Makefile.am @@ -59,6 +59,7 @@ libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) 
$(AVX2_FLAGS) libwebpdspdecode_sse41_la_SOURCES = +libwebpdspdecode_sse41_la_SOURCES += alpha_processing_sse41.c libwebpdspdecode_sse41_la_SOURCES += dec_sse41.c libwebpdspdecode_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) libwebpdspdecode_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS) diff --git a/src/dsp/alpha_processing.c b/src/dsp/alpha_processing.c index bef13f41..1716cace 100644 --- a/src/dsp/alpha_processing.c +++ b/src/dsp/alpha_processing.c @@ -345,6 +345,7 @@ int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int); extern void WebPInitAlphaProcessingMIPSdspR2(void); extern void WebPInitAlphaProcessingSSE2(void); +extern void WebPInitAlphaProcessingSSE41(void); static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used = (VP8CPUInfo)&alpha_processing_last_cpuinfo_used; @@ -365,6 +366,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) { #if defined(WEBP_USE_SSE2) if (VP8GetCPUInfo(kSSE2)) { WebPInitAlphaProcessingSSE2(); +#if defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + WebPInitAlphaProcessingSSE41(); + } +#endif } #endif #if defined(WEBP_USE_MIPS_DSP_R2) diff --git a/src/dsp/alpha_processing_sse41.c b/src/dsp/alpha_processing_sse41.c new file mode 100644 index 00000000..986fde94 --- /dev/null +++ b/src/dsp/alpha_processing_sse41.c @@ -0,0 +1,92 @@ +// Copyright 2015 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Utilities for processing transparent channel, SSE4.1 variant. 
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include <smmintrin.h>
+
+//------------------------------------------------------------------------------
+// Copies byte 0 of each 4-byte pixel to alpha[]; returns 1 iff all are 0xff.
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  // alpha_and stores an 'and' operation of all the alpha[] values. The final
+  // value is not 0xff if any of the alpha[] is not equal to 0xff.
+  uint32_t alpha_and = 0xff;
+  int i, j;
+  const __m128i all_0xff = _mm_set1_epi32(~0);  // every byte is 0xff
+  __m128i all_alphas = all_0xff;
+
+  // We must be able to access 3 extra bytes after the last written byte
+  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
+  // last byte of the quadruplet.
+  const int limit = (width - 1) & ~15;
+  const __m128i kCstAlpha0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+                                          -1, -1, -1, -1, 12, 8, 4, 0);
+  const __m128i kCstAlpha1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+                                          12, 8, 4, 0, -1, -1, -1, -1);
+  const __m128i kCstAlpha2 = _mm_set_epi8(-1, -1, -1, -1, 12, 8, 4, 0,
+                                          -1, -1, -1, -1, -1, -1, -1, -1);
+  const __m128i kCstAlpha3 = _mm_set_epi8(12, 8, 4, 0, -1, -1, -1, -1,
+                                          -1, -1, -1, -1, -1, -1, -1, -1);
+  for (j = 0; j < height; ++j) {
+    const __m128i* src = (const __m128i*)argb;
+    for (i = 0; i < limit; i += 16) {
+      // load 64 argb bytes (16 pixels)
+      const __m128i a0 = _mm_loadu_si128(src + 0);
+      const __m128i a1 = _mm_loadu_si128(src + 1);
+      const __m128i a2 = _mm_loadu_si128(src + 2);
+      const __m128i a3 = _mm_loadu_si128(src + 3);
+      const __m128i b0 = _mm_shuffle_epi8(a0, kCstAlpha0);  // pixels 0..3
+      const __m128i b1 = _mm_shuffle_epi8(a1, kCstAlpha1);  // pixels 4..7
+      const __m128i b2 = _mm_shuffle_epi8(a2, kCstAlpha2);  // pixels 8..11
+      const __m128i b3 = _mm_shuffle_epi8(a3, kCstAlpha3);  // pixels 12..15
+      const __m128i c0 = _mm_or_si128(b0, b1);
+      const __m128i c1 = _mm_or_si128(b2, b3);
+      const __m128i d0 = _mm_or_si128(c0, c1);
+      // store the 16 alpha bytes (lanes with a -1 index were zeroed)
+      _mm_storeu_si128((__m128i*)&alpha[i], d0);
+      // accumulate sixteen alpha 'and' in parallel
+      all_alphas = _mm_and_si128(all_alphas, d0);
+      src += 4;
+    }
+    for (; i < width; ++i) {  // leftover pixels, one at a time
+      const uint32_t alpha_value = argb[4 * i];
+      alpha[i] = alpha_value;
+      alpha_and &= alpha_value;
+    }
+    argb += argb_stride;    // next row; strides are in bytes
+    alpha += alpha_stride;
+  }
+  // Combine the sixteen alpha 'and' into a 16-bit mask.
+  alpha_and |= 0xff00u;  // pretend the upper bits [8..15] were tested ok.
+  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+  return (alpha_and == 0xffffu);
+}
+
+//------------------------------------------------------------------------------
+// Entry point: installs the SSE4.1 ExtractAlpha as WebPExtractAlpha.
+
+extern void WebPInitAlphaProcessingSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
+  WebPExtractAlpha = ExtractAlpha;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE41)
+
+#endif  // WEBP_USE_SSE41