From a6bb9b17d8e50e6156118cdc861059cbf7305040 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Thu, 11 Sep 2014 07:58:42 +0200 Subject: [PATCH] SSE2 for inverse Mult(ARGB)Row and ApplyAlphaMultiply Change-Id: Iab5c0e4a4d2b31f86736a9b277e62b6e28c3d2b4 WebPMultRow: ~7x faster WebPMultARGBRow: ~3x faster ApplyAlphaMultiply: 60% faster --- src/dsp/alpha_processing.c | 10 +-- src/dsp/alpha_processing_sse2.c | 137 ++++++++++++++++++++++++++++++++ src/dsp/dsp.h | 5 ++ 3 files changed, 147 insertions(+), 5 deletions(-) diff --git a/src/dsp/alpha_processing.c b/src/dsp/alpha_processing.c index cc47844f..176135ab 100644 --- a/src/dsp/alpha_processing.c +++ b/src/dsp/alpha_processing.c @@ -134,7 +134,7 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) { #endif // USE_TABLES_FOR_ALPHA_MULT -static void MultARGBRow(uint32_t* const ptr, int width, int inverse) { +void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) { int x; for (x = 0; x < width; ++x) { const uint32_t argb = ptr[x]; @@ -154,8 +154,8 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) { } } -static void MultRow(uint8_t* const ptr, const uint8_t* const alpha, - int width, int inverse) { +void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha, + int width, int inverse) { int x; for (x = 0; x < width; ++x) { const uint32_t a = alpha[x]; @@ -315,8 +315,8 @@ extern void WebPInitAlphaProcessingMIPSdspR2(void); extern void WebPInitAlphaProcessingSSE2(void); void WebPInitAlphaProcessing(void) { - WebPMultARGBRow = MultARGBRow; - WebPMultRow = MultRow; + WebPMultARGBRow = WebPMultARGBRowC; + WebPMultRow = WebPMultRowC; WebPApplyAlphaMultiply = ApplyAlphaMultiply; WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b; WebPDispatchAlpha = DispatchAlpha; diff --git a/src/dsp/alpha_processing_sse2.c b/src/dsp/alpha_processing_sse2.c index c8d95e1c..bc6adf80 100644 --- a/src/dsp/alpha_processing_sse2.c +++ b/src/dsp/alpha_processing_sse2.c @@ -74,6 +74,140 @@ static 
int DispatchAlpha(const uint8_t* alpha, int alpha_stride, return (alpha_and != 0xff); } +//------------------------------------------------------------------------------ +// Non-dither premultiplied modes + +#define MULTIPLIER(a) ((a) * 0x8081) +#define PREMULTIPLY(x, m) (((x) * (m)) >> 23) + +// We can't use a 'const int' for the SHUFFLE value, because it has to be an +// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro here. +#define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do { \ + const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX)); \ + const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); \ + const __m128i alpha0 = _mm_and_si128(argb1, MASK); \ + const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE); \ + const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE); \ + /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */ \ + const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT); \ + const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT); \ + const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); \ + const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); \ + const __m128i argb4 = _mm_adds_epu16(argb2, argb3); \ + const __m128i argb5 = _mm_srli_epi16(argb4, 7); \ + const __m128i argb6 = _mm_or_si128(argb5, alpha0); \ + const __m128i argb7 = _mm_packus_epi16(argb6, zero); \ + _mm_storel_epi64((__m128i*)&(RGBX), argb7); \ +} while (0) + +static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first, + int w, int h, int stride) { + const __m128i zero = _mm_setzero_si128(); + const int kSpan = 2; + const int w2 = w & ~(kSpan - 1); + while (h-- > 0) { + uint32_t* const rgbx = (uint32_t*)rgba; + int i; + if (!alpha_first) { + const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0); + const __m128i kMult = + _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081); + for (i = 0; i < w2; i += kSpan) { + APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult); + } + } else { + const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 
0xff); + const __m128i kMult = + _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0); + for (i = 0; i < w2; i += kSpan) { + APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult); + } + } + // Finish with left-overs. + for (; i < w; ++i) { + uint8_t* const rgb = rgba + (alpha_first ? 1 : 0); + const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3); + const uint32_t a = alpha[4 * i]; + if (a != 0xff) { + const uint32_t mult = MULTIPLIER(a); + rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult); + rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult); + rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult); + } + } + rgba += stride; + } +} +#undef MULTIPLIER +#undef PREMULTIPLY + +// ----------------------------------------------------------------------------- +// Apply alpha value to rows + +// We use: kINV255 = (1 << 24) / 255 = 0x010101 +// So: a * kINV255 = (a << 16) | [(a << 8) | a] +// -> _mm_mulhi_epu16() takes care of the (a<<16) part, +// and _mm_mullo_epi16(a * 0x0101,...) takes care of the "(a << 8) | a" one. 
+ +static void MultARGBRow(uint32_t* const ptr, int width, int inverse) { + int x = 0; + if (!inverse) { + const int kSpan = 2; + const __m128i zero = _mm_setzero_si128(); + const __m128i kRound = + _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7); + const __m128i kMult = + _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101); + const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0); + const int w2 = width & ~(kSpan - 1); + for (x = 0; x < w2; x += kSpan) { + const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]); + const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); + const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3)); + const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3)); + const __m128i tmp2 = _mm_srli_epi64(tmp1, 16); + const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult); + const __m128i scale1 = _mm_or_si128(tmp2, kOne64); + const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); + const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); + const __m128i argb4 = _mm_adds_epu16(argb2, argb3); + const __m128i argb5 = _mm_adds_epu16(argb4, kRound); + const __m128i argb6 = _mm_srli_epi16(argb5, 8); + const __m128i argb7 = _mm_packus_epi16(argb6, zero); + _mm_storel_epi64((__m128i*)&ptr[x], argb7); + } + } + width -= x; + if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse); +} + +static void MultRow(uint8_t* const ptr, const uint8_t* const alpha, + int width, int inverse) { + int x = 0; + if (!inverse) { + const int kSpan = 8; + const __m128i zero = _mm_setzero_si128(); + const __m128i kRound = _mm_set1_epi16(1 << 7); + const int w2 = width & ~(kSpan - 1); + for (x = 0; x < w2; x += kSpan) { + const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]); + const __m128i v1 = _mm_unpacklo_epi8(v0, zero); + const __m128i alpha0 = _mm_loadl_epi64((__m128i*)&alpha[x]); + const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero); + const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, 
alpha0); + const __m128i v2 = _mm_mulhi_epu16(v1, alpha2); + const __m128i v3 = _mm_mullo_epi16(v1, alpha1); + const __m128i v4 = _mm_adds_epu16(v2, v3); + const __m128i v5 = _mm_adds_epu16(v4, kRound); + const __m128i v6 = _mm_srli_epi16(v5, 8); + const __m128i v7 = _mm_packus_epi16(v6, zero); + _mm_storel_epi64((__m128i*)&ptr[x], v7); + } + } + width -= x; + if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse); +} + #endif // WEBP_USE_SSE2 //------------------------------------------------------------------------------ @@ -83,6 +217,9 @@ extern void WebPInitAlphaProcessingSSE2(void); void WebPInitAlphaProcessingSSE2(void) { #if defined(WEBP_USE_SSE2) + WebPMultARGBRow = MultARGBRow; + WebPMultRow = MultRow; + WebPApplyAlphaMultiply = ApplyAlphaMultiply; WebPDispatchAlpha = DispatchAlpha; #endif } diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 5306b9b9..a50f0539 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -294,6 +294,11 @@ void WebPMultRows(uint8_t* ptr, int stride, const uint8_t* alpha, int alpha_stride, int width, int num_rows, int inverse); +// Plain-C versions, used as fallback by some implementations. +void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha, + int width, int inverse); +void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse); + // To be called first before using the above. void WebPInitAlphaProcessing(void);