Improve SSE4.1 implementation of TTransform.

The SSE4.1 implementation is slower than the SSE2 one. This seems to be due to two things: slow code generated by gcc for _mm_loadl_epi64 (which looks like a bug in my gcc 4.8), and a very slow _mm_hadd_epi32. Both were confirmed by IACA and by experiments.

Change-Id: I05607f66b7ccd8f4f42e000693aea583ffd5768f
2025-07-23 17:29:58 +02:00 · 2016-02-19 09:11:53 +01:00
parent 7ccdb734c2
commit 6583bb1a42
1 changed files with 13 additions and 10 deletions
--- a/src/dsp/enc_sse41.c
+++ b/src/dsp/enc_sse41.c
@ -72,17 +72,22 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 // w[] contains a row-major 4 by 4 symmetric matrix.
 static int TTransform(const uint8_t* inA, const uint8_t* inB,
                      const uint16_t* const w) {
+  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;

  // Load and combine inputs.
  {
-    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
-    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
-    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
+    const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
+    const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
+    const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
+    // In SSE4.1, with gcc 4.8 at least (maybe other versions),
+    // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump
+    // of inA and inB, _mm_loadl_epi64 is still used not to have an out of
+    // bound read.
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
-    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
-    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
-    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
+    const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
+    const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
+    const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
@ -158,11 +163,9 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,

    // difference of weighted sums
    A_b2 = _mm_sub_epi32(A_b0, B_b0);
-    // cascading summation of the differences
-    B_b0 = _mm_hadd_epi32(A_b2, A_b2);
-    B_b2 = _mm_hadd_epi32(B_b0, B_b0);
-    return _mm_cvtsi128_si32(B_b2);
+    _mm_storeu_si128((__m128i*)&sum[0], A_b2);
  }
+  return sum[0] + sum[1] + sum[2] + sum[3];
 }

 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,