From 6583bb1a42cbee2d156ba286c5a63cc863ea3336 Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Fri, 19 Feb 2016 09:11:53 +0100 Subject: [PATCH] Improve SSE4.1 implementation of TTransform. SSE4.1 is slower than the SSE2 implementation and this seems to be due to a slow _mm_loadl_epi64 implementation by gcc (hence a bug with my gcc 4.8) and a very slow _mm_hadd_epi32. Both got confirmed by IACA and experiments. Change-Id: I05607f66b7ccd8f4f42e000693aea583ffd5768f --- src/dsp/enc_sse41.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/dsp/enc_sse41.c b/src/dsp/enc_sse41.c index d817c44a..a1783901 100644 --- a/src/dsp/enc_sse41.c +++ b/src/dsp/enc_sse41.c @@ -72,17 +72,22 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { + int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; // Load and combine inputs. { - const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); - const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); - const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); + const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]); + const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]); + const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]); + // In SSE4.1, with gcc 4.8 at least (maybe other versions), + // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump + // of inA and inB, _mm_loadl_epi64 is still used not to have an out of + // bound read. const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); - const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); - const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); - const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); + const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]); + const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]); + const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). @@ -158,11 +163,9 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); - // cascading summation of the differences - B_b0 = _mm_hadd_epi32(A_b2, A_b2); - B_b2 = _mm_hadd_epi32(B_b0, B_b0); - return _mm_cvtsi128_si32(B_b2); + _mm_storeu_si128((__m128i*)&sum[0], A_b2); } + return sum[0] + sum[1] + sum[2] + sum[3]; } static int Disto4x4(const uint8_t* const a, const uint8_t* const b,