From 44f91b0ddd07b8482c3e5926fba4745bd0969cbe Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Thu, 10 Apr 2025 11:52:42 +0200 Subject: [PATCH] Speed DispatchAlpha_SSE2 up On some datasets, this was taking 2.5%. 2% when switching to _mm_maskmoveu_si128. 1.7% when using _mm_loadu_si128. Confirmed by IACA: going from throughput of 4.26 to 3.5 and then to 6.26 for twice the input. Change-Id: I409f901aaad9d39bf55a1aac28cc25f126876b01 --- src/dsp/alpha_processing_sse2.c | 57 +++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/src/dsp/alpha_processing_sse2.c b/src/dsp/alpha_processing_sse2.c index aa0cc284..bfe40c30 100644 --- a/src/dsp/alpha_processing_sse2.c +++ b/src/dsp/alpha_processing_sse2.c @@ -16,6 +16,8 @@ #if defined(WEBP_USE_SSE2) #include <emmintrin.h> +#include "src/dsp/cpu.h" + //------------------------------------------------------------------------------ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, @@ -26,38 +28,44 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, uint32_t alpha_and = 0xff; int i, j; const __m128i zero = _mm_setzero_si128(); - const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00); // to preserve RGB - const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); - __m128i all_alphas = all_0xff; + const __m128i alpha_mask = _mm_set1_epi32((int)0xff); // to preserve A + const __m128i all_0xff = _mm_set1_epi8(0xff); + __m128i all_alphas16 = all_0xff; + __m128i all_alphas8 = all_0xff; // We must be able to access 3 extra bytes after the last written byte // 'dst[4 * width - 4]', because we don't know if alpha is the first or the // last byte of the quadruplet. 
- const int limit = (width - 1) & ~7; - for (j = 0; j < height; ++j) { - __m128i* out = (__m128i*)dst; - for (i = 0; i < limit; i += 8) { + char* ptr = (char*)dst; + for (i = 0; i + 16 <= width - 1; i += 16) { + // load 16 alpha bytes + const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]); + const __m128i a1_lo = _mm_unpacklo_epi8(a0, zero); + const __m128i a1_hi = _mm_unpackhi_epi8(a0, zero); + const __m128i a2_lo_lo = _mm_unpacklo_epi16(a1_lo, zero); + const __m128i a2_lo_hi = _mm_unpackhi_epi16(a1_lo, zero); + const __m128i a2_hi_lo = _mm_unpacklo_epi16(a1_hi, zero); + const __m128i a2_hi_hi = _mm_unpackhi_epi16(a1_hi, zero); + _mm_maskmoveu_si128(a2_lo_lo, alpha_mask, ptr + 0); + _mm_maskmoveu_si128(a2_lo_hi, alpha_mask, ptr + 16); + _mm_maskmoveu_si128(a2_hi_lo, alpha_mask, ptr + 32); + _mm_maskmoveu_si128(a2_hi_hi, alpha_mask, ptr + 48); + // accumulate 16 alpha 'and' in parallel + all_alphas16 = _mm_and_si128(all_alphas16, a0); + ptr += 64; + } + if (i + 8 <= width - 1) { // load 8 alpha bytes const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]); const __m128i a1 = _mm_unpacklo_epi8(a0, zero); const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); - // load 8 dst pixels (32 bytes) - const __m128i b0_lo = _mm_loadu_si128(out + 0); - const __m128i b0_hi = _mm_loadu_si128(out + 1); - // mask dst alpha values - const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask); - const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask); - // combine - const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo); - const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi); - // store - _mm_storeu_si128(out + 0, b2_lo); - _mm_storeu_si128(out + 1, b2_hi); - // accumulate eight alpha 'and' in parallel - all_alphas = _mm_and_si128(all_alphas, a0); - out += 2; + _mm_maskmoveu_si128(a2_lo, alpha_mask, ptr); + _mm_maskmoveu_si128(a2_hi, alpha_mask, ptr + 16); + // accumulate 8 alpha 'and' in parallel + all_alphas8 = 
_mm_and_si128(all_alphas8, a0); + i += 8; } for (; i < width; ++i) { const uint32_t alpha_value = alpha[i]; @@ -68,8 +76,9 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, dst += dst_stride; } // Combine the eight alpha 'and' into a 8-bit mask. - alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); - return (alpha_and != 0xff); + alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas8, all_0xff)) & 0xff; + return (alpha_and != 0xff || + _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas16, all_0xff)) != 0xffff); } static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,