From 7bda3deb890a1cc0cac9f8987bdecef5330bfb55 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Tue, 12 Nov 2024 14:58:23 -0800
Subject: [PATCH 1/2] rework AddVector_SSE2

Take advantage of the known sizes used by VP8LHistogramAdd() and remove
the loop for the remainder. The loop was being auto-vectorized, making
the code larger and slower than the vectorized C code.

For larger sizes the new code is ~4-7% faster than the old code, with
about the same improvement over the vectorized C code. For the minimal
size (40), the new code is ~30% faster than the C and old SSE2 code.

The LINE_SIZE==8 option is removed with this change. It had been set to
16 for its entire life and clang-16 was unrolling the LINE_SIZE==8 case
by 2 in any case; they both profile similarly.

Change-Id: I2376e2dca3bffa38477b4a432f4c533419e3be0e
---
 src/dsp/lossless_enc_sse2.c | 42 +++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/src/dsp/lossless_enc_sse2.c b/src/dsp/lossless_enc_sse2.c
index f6706dd5..c0a00cfc 100644
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@@ -175,36 +175,56 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
 
 // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
 // that's ok since the histogram values are less than 1<<28 (max picture size).
-#define LINE_SIZE 16  // 8 or 16
 static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
                            const uint32_t* WEBP_RESTRICT b,
                            uint32_t* WEBP_RESTRICT out, int size) {
-  int i;
-  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+  int i = 0;
+  int aligned_size = size & ~15;
+  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
+  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
+  // 2). See the usage in VP8LHistogramAdd().
+  assert(size >= 16);
+  assert(size % 2 == 0);
+
+  do {
     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
-#if (LINE_SIZE == 16)
     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
-#endif
     const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
     const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
-#if (LINE_SIZE == 16)
     const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
     const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
-#endif
     _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
     _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
-#if (LINE_SIZE == 16)
     _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
+    i += 16;
+  } while (i != aligned_size);
+
+  if ((size & 8) != 0) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
+    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+    i += 8;
   }
-  for (; i < size; ++i) {
-    out[i] = a[i] + b[i];
+
+  size &= 7;
+  if (size == 4) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
+    _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
+  } else if (size == 2) {
+    const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[i]);
+    _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
   }
 }
 
+#define LINE_SIZE 16  // 8 or 16
 static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
                              uint32_t* WEBP_RESTRICT out, int size) {
   int i;

From 61e2cfdadd295e74db6cd361d5c148b96af6b4b8 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Tue, 12 Nov 2024 15:13:42 -0800
Subject: [PATCH 2/2] rework AddVectorEq_SSE2

Take advantage of the known sizes used by VP8LHistogramAdd() and remove
the loop for the remainder. The loop was being auto-vectorized, making
the code larger and slower than the vectorized C code.

For larger sizes the new code is ~3-4.5% faster than the old code, with
about the same improvement over the vectorized C code. For the minimal
size (40), the new code is ~30% faster than the C and old SSE2 code.

The LINE_SIZE==8 option is removed with this change. It had been set to
16 for its entire life and clang-16 was unrolling the LINE_SIZE==8 case
by 2 in any case; they both profile similarly.
Change-Id: I6dfedfd57474f44d15e2ce510a48e5252221077a
---
 src/dsp/lossless_enc_sse2.c | 42 ++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/dsp/lossless_enc_sse2.c b/src/dsp/lossless_enc_sse2.c
index c0a00cfc..b2fa6480 100644
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@@ -224,35 +224,53 @@ static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
   }
 }
 
-#define LINE_SIZE 16  // 8 or 16
 static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
                              uint32_t* WEBP_RESTRICT out, int size) {
-  int i;
-  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+  int i = 0;
+  int aligned_size = size & ~15;
+  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
+  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
+  // 2). See the usage in VP8LHistogramAdd().
+  assert(size >= 16);
+  assert(size % 2 == 0);
+
+  do {
     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
-#if (LINE_SIZE == 16)
     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
-#endif
     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
     const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
-#if (LINE_SIZE == 16)
     const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
     const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
-#endif
     _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
     _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
-#if (LINE_SIZE == 16)
     _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
+    i += 16;
+  } while (i != aligned_size);
+
+  if ((size & 8) != 0) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
+    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+    i += 8;
   }
-  for (; i < size; ++i) {
-    out[i] += a[i];
+
+  size &= 7;
+  if (size == 4) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]);
+    _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
+  } else if (size == 2) {
+    const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]);
+    _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
   }
 }
-#undef LINE_SIZE
 
 //------------------------------------------------------------------------------
 // Entropy
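
A note on the tail handling in both patches: the 16-wide loop is followed only
by an 8-wide block and then a 4- or 2-wide block, so a remainder of 6 would be
silently dropped. The code comment argues this cannot happen for the sizes
VP8LHistogramAdd() passes. The standalone program below is a sketch of that
size reasoning, not part of either patch: NUM_DISTANCE_CODES, NUM_LITERAL_CODES
and NUM_LENGTH_CODES are the values quoted in the patch comment, the cache-bits
upper bound of 10 is assumed from libwebp's MAX_COLOR_CACHE_BITS, and
TailIsCovered() is a hypothetical helper that mirrors the patched control flow.

#include <stdio.h>

#define NUM_DISTANCE_CODES 40
#define NUM_LITERAL_CODES 256
#define NUM_LENGTH_CODES 24
#define MAX_COLOR_CACHE_BITS 10  // assumed from libwebp

// Mirrors the tail of the patched AddVector_SSE2/AddVectorEq_SSE2: a 16-wide
// main loop, then an 8-wide block, then an explicit 4- or 2-wide case.
static int TailIsCovered(int size) {
  int rem = size & 15;    // left over after the do/while over 16 elements
  if (rem & 8) rem -= 8;  // consumed by the (size & 8) block
  // The explicit cases remaining in the patch are 4, 2 and 0; a remainder of
  // 6 has no branch, so it must be unreachable.
  return rem == 0 || rem == 2 || rem == 4;
}

int main(void) {
  int cache_bits;
  // Distance histograms: always NUM_DISTANCE_CODES entries.
  if (!TailIsCovered(NUM_DISTANCE_CODES)) return 1;
  // Literal histograms: 256 + 24 entries, plus an optional color cache of
  // (1 << cache_bits) entries.
  if (!TailIsCovered(NUM_LITERAL_CODES + NUM_LENGTH_CODES)) return 1;
  for (cache_bits = 1; cache_bits <= MAX_COLOR_CACHE_BITS; ++cache_bits) {
    const int size = NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << cache_bits);
    if (!TailIsCovered(size)) {
      printf("uncovered tail for size %d\n", size);
      return 1;
    }
  }
  printf("all histogram sizes handled\n");
  return 0;
}

Every reachable size leaves a remainder of 0, 2 or 4 after the 8-wide block:
40 and 280 are both 8 mod 16, and the optional cache adds 2, 4 or a multiple
of 8, which is why no size == 6 branch is needed.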