rework AddVectorEq_SSE2

Take advantage of the known sizes used by VP8LHistogramAdd() and remove loop for the remainder. The loop was being auto-vectorized making the code larger and slower than the vectorized C code. For larger sizes the new code is ~3-4.5% faster than the old code with about the same improvement against the vectorized C code. For the minimal size (40), the new code is ~30% faster than the C and old SSE2 code. The LINE_SIZE==8 option is removed with this change. It had been set to 16 for its entire life and clang-16 was unrolling the LINE_SIZE==8 case by 2 in any case; they both profile similarly. Change-Id: I6dfedfd57474f44d15e2ce510a48e5252221077a
2025-04-05 08:26:51 +02:00 · 2024-11-12 15:13:42 -08:00 · 2024-11-12 15:13:42 -08:00 · 61e2cfdadd
commit 61e2cfdadd
parent 7bda3deb89
1 changed files with 30 additions and 12 deletions
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@ -224,35 +224,53 @@ static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
  }
 }
 #define LINE_SIZE 16    // 8 or 16
 static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
                             uint32_t* WEBP_RESTRICT out, int size) {
-  int i;
+  int i = 0;
-  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+  int aligned_size = size & ~15;
  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
  // 2). See the usage in VP8LHistogramAdd().
  assert(size >= 16);
  assert(size % 2 == 0);
  do {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
 #if (LINE_SIZE == 16)
    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
 #endif
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]);
 #if (LINE_SIZE == 16)
    const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]);
    const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
 #endif
    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
 #if (LINE_SIZE == 16)
    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
+    i += 16;
  } while (i != aligned_size);
  if ((size & 8) != 0) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
    i += 8;
  }
-  for (; i < size; ++i) {
+
-    out[i] += a[i];
+  size &= 7;
  if (size == 4) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]);
    _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
  } else if (size == 2) {
    const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]);
    _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
  }
 }
 #undef LINE_SIZE
 //------------------------------------------------------------------------------
 // Entropy