From 61e2cfdadd295e74db6cd361d5c148b96af6b4b8 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 12 Nov 2024 15:13:42 -0800
Subject: [PATCH] rework AddVectorEq_SSE2

Take advantage of the known sizes used by VP8LHistogramAdd() and remove
the loop for the remainder. The loop was being auto-vectorized, making
the code larger and slower than the vectorized C code.

For larger sizes, the new code is ~3-4.5% faster than the old code, with
about the same improvement against the vectorized C code. For the
minimal size (40), the new code is ~30% faster than the C and old SSE2
code.

The LINE_SIZE==8 option is removed with this change. It had been set to
16 for its entire life, and clang-16 was unrolling the LINE_SIZE==8 case
by 2 in any case; they both profile similarly.

Change-Id: I6dfedfd57474f44d15e2ce510a48e5252221077a
---
 src/dsp/lossless_enc_sse2.c | 42 ++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/dsp/lossless_enc_sse2.c b/src/dsp/lossless_enc_sse2.c
index c0a00cfc..b2fa6480 100644
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@@ -224,35 +224,53 @@ static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
   }
 }
 
-#define LINE_SIZE 16  // 8 or 16
 static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
                              uint32_t* WEBP_RESTRICT out, int size) {
-  int i;
-  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+  int i = 0;
+  int aligned_size = size & ~15;
+  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
+  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
+  // 2). See the usage in VP8LHistogramAdd().
+  assert(size >= 16);
+  assert(size % 2 == 0);
+
+  do {
     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
-#if (LINE_SIZE == 16)
     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
-#endif
     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
     const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
-#if (LINE_SIZE == 16)
     const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
     const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
-#endif
     _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
     _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
-#if (LINE_SIZE == 16)
     _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
+    i += 16;
+  } while (i != aligned_size);
+
+  if ((size & 8) != 0) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
+    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+    i += 8;
   }
-  for (; i < size; ++i) {
-    out[i] += a[i];
+
+  size &= 7;
+  if (size == 4) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]);
+    _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
+  } else if (size == 2) {
+    const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]);
+    _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
   }
 }
-#undef LINE_SIZE
 
 //------------------------------------------------------------------------------
 // Entropy
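
Note (not part of the patch): the new tail handling can drop the scalar remainder loop only because every size reaching AddVectorEq_SSE2 splits into 16-element blocks plus at most one 8-element block and one 4- or 2-element block. The following standalone C sketch checks that decomposition for the sizes described in the code comment; the concrete size list (40, and 256 + 24 plus a non-zero power of two, with an assumed exponent range of 1..10) and the helper CheckTailCoverage() are illustrative assumptions reconstructed from that comment, not code or callers from the library.

// Standalone sketch, independent of libwebp. The size list mirrors the
// comment in the patch; it is an assumption, not the actual call sites.
#include <assert.h>
#include <stdio.h>

static void CheckTailCoverage(int size) {
  int covered = size & ~15;                // consumed by the 16-element main loop
  assert(size >= 16);
  assert(size % 2 == 0);
  if ((size & 8) != 0) covered += 8;       // optional 8-element block
  if ((size & 7) == 4) covered += 4;       // optional 4-element block
  else if ((size & 7) == 2) covered += 2;  // optional 2-element block
  assert(covered == size);                 // nothing left for a scalar loop
}

int main(void) {
  int k;
  CheckTailCoverage(40);          // NUM_DISTANCE_CODES
  CheckTailCoverage(256 + 24);    // NUM_LITERAL_CODES + NUM_LENGTH_CODES
  for (k = 1; k <= 10; ++k) {     // plus a non-zero power of 2 (assumed range)
    CheckTailCoverage(256 + 24 + (1 << k));
  }
  printf("16/8/4/2 decomposition covers all checked sizes\n");
  return 0;
}

The sketch only exercises the size arithmetic, so it compiles with any C compiler and needs no SSE2 support.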